Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc | 97
-rw-r--r--  sys/kern/Makefile | 21
-rw-r--r--  sys/kern/bus_if.m | 672
-rw-r--r--  sys/kern/capabilities.conf | 754
-rw-r--r--  sys/kern/clock_if.m | 45
-rw-r--r--  sys/kern/cpufreq_if.m | 100
-rw-r--r--  sys/kern/device_if.m | 318
-rw-r--r--  sys/kern/dtio_kdtrace.c | 232
-rw-r--r--  sys/kern/genassym.sh | 69
-rw-r--r--  sys/kern/imgact_aout.c | 343
-rw-r--r--  sys/kern/imgact_elf.c | 2135
-rw-r--r--  sys/kern/imgact_elf32.c | 31
-rw-r--r--  sys/kern/imgact_elf64.c | 31
-rw-r--r--  sys/kern/imgact_gzip.c | 393
-rw-r--r--  sys/kern/imgact_shell.c | 258
-rw-r--r--  sys/kern/inflate.c | 1077
-rw-r--r--  sys/kern/init_main.c | 855
-rw-r--r--  sys/kern/init_sysent.c | 581
-rw-r--r--  sys/kern/kern_acct.c | 647
-rw-r--r--  sys/kern/kern_alq.c | 971
-rw-r--r--  sys/kern/kern_clock.c | 895
-rw-r--r--  sys/kern/kern_clocksource.c | 949
-rw-r--r--  sys/kern/kern_condvar.c | 456
-rw-r--r--  sys/kern/kern_conf.c | 1459
-rw-r--r--  sys/kern/kern_cons.c | 643
-rw-r--r--  sys/kern/kern_context.c | 129
-rw-r--r--  sys/kern/kern_cpu.c | 1063
-rw-r--r--  sys/kern/kern_cpuset.c | 1166
-rw-r--r--  sys/kern/kern_ctf.c | 340
-rw-r--r--  sys/kern/kern_descrip.c | 4016
-rw-r--r--  sys/kern/kern_dtrace.c | 117
-rw-r--r--  sys/kern/kern_environment.c | 626
-rw-r--r--  sys/kern/kern_et.c | 246
-rw-r--r--  sys/kern/kern_event.c | 2261
-rw-r--r--  sys/kern/kern_exec.c | 1496
-rw-r--r--  sys/kern/kern_exit.c | 1261
-rw-r--r--  sys/kern/kern_fail.c | 611
-rw-r--r--  sys/kern/kern_ffclock.c | 479
-rw-r--r--  sys/kern/kern_fork.c | 1052
-rw-r--r--  sys/kern/kern_gzio.c | 400
-rw-r--r--  sys/kern/kern_hhook.c | 521
-rw-r--r--  sys/kern/kern_idle.c | 86
-rw-r--r--  sys/kern/kern_intr.c | 1943
-rw-r--r--  sys/kern/kern_jail.c | 4677
-rw-r--r--  sys/kern/kern_khelp.c | 372
-rw-r--r--  sys/kern/kern_kthread.c | 466
-rw-r--r--  sys/kern/kern_ktr.c | 495
-rw-r--r--  sys/kern/kern_ktrace.c | 1269
-rw-r--r--  sys/kern/kern_linker.c | 2162
-rw-r--r--  sys/kern/kern_lock.c | 1505
-rw-r--r--  sys/kern/kern_lockf.c | 2545
-rw-r--r--  sys/kern/kern_lockstat.c | 64
-rw-r--r--  sys/kern/kern_loginclass.c | 238
-rw-r--r--  sys/kern/kern_malloc.c | 1100
-rw-r--r--  sys/kern/kern_mbuf.c | 694
-rw-r--r--  sys/kern/kern_mib.c | 542
-rw-r--r--  sys/kern/kern_module.c | 523
-rw-r--r--  sys/kern/kern_mtxpool.c | 218
-rw-r--r--  sys/kern/kern_mutex.c | 1009
-rw-r--r--  sys/kern/kern_ntptime.c | 1055
-rw-r--r--  sys/kern/kern_osd.c | 403
-rw-r--r--  sys/kern/kern_physio.c | 170
-rw-r--r--  sys/kern/kern_pmc.c | 345
-rw-r--r--  sys/kern/kern_poll.c | 567
-rw-r--r--  sys/kern/kern_priv.c | 185
-rw-r--r--  sys/kern/kern_proc.c | 2740
-rw-r--r--  sys/kern/kern_prot.c | 2222
-rw-r--r--  sys/kern/kern_racct.c | 1291
-rw-r--r--  sys/kern/kern_rangelock.c | 248
-rw-r--r--  sys/kern/kern_rctl.c | 1870
-rw-r--r--  sys/kern/kern_resource.c | 1434
-rw-r--r--  sys/kern/kern_rmlock.c | 831
-rw-r--r--  sys/kern/kern_rwlock.c | 1232
-rw-r--r--  sys/kern/kern_sdt.c | 51
-rw-r--r--  sys/kern/kern_sema.c | 176
-rw-r--r--  sys/kern/kern_sharedpage.c | 239
-rw-r--r--  sys/kern/kern_shutdown.c | 893
-rw-r--r--  sys/kern/kern_sig.c | 3469
-rw-r--r--  sys/kern/kern_switch.c | 513
-rw-r--r--  sys/kern/kern_sx.c | 1214
-rw-r--r--  sys/kern/kern_synch.c | 632
-rw-r--r--  sys/kern/kern_syscalls.c | 220
-rw-r--r--  sys/kern/kern_sysctl.c | 1656
-rw-r--r--  sys/kern/kern_tc.c | 2030
-rw-r--r--  sys/kern/kern_thr.c | 555
-rw-r--r--  sys/kern/kern_thread.c | 1054
-rw-r--r--  sys/kern/kern_time.c | 1648
-rw-r--r--  sys/kern/kern_timeout.c | 1433
-rw-r--r--  sys/kern/kern_umtx.c | 3918
-rw-r--r--  sys/kern/kern_uuid.c | 426
-rw-r--r--  sys/kern/kern_xxx.c | 471
-rw-r--r--  sys/kern/ksched.c | 292
-rw-r--r--  sys/kern/link_elf.c | 1605
-rw-r--r--  sys/kern/link_elf_obj.c | 1375
-rw-r--r--  sys/kern/linker_if.m | 145
-rw-r--r--  sys/kern/makesyscalls.sh | 653
-rw-r--r--  sys/kern/md4c.c | 288
-rw-r--r--  sys/kern/md5c.c | 340
-rw-r--r--  sys/kern/p1003_1b.c | 315
-rw-r--r--  sys/kern/posix4_mib.c | 183
-rw-r--r--  sys/kern/sched_4bsd.c | 1784
-rw-r--r--  sys/kern/sched_ule.c | 2911
-rw-r--r--  sys/kern/serdev_if.m | 94
-rw-r--r--  sys/kern/stack_protector.c | 31
-rw-r--r--  sys/kern/subr_acl_nfs4.c | 1417
-rw-r--r--  sys/kern/subr_acl_posix1e.c | 691
-rw-r--r--  sys/kern/subr_autoconf.c | 230
-rw-r--r--  sys/kern/subr_blist.c | 1095
-rw-r--r--  sys/kern/subr_bufring.c | 65
-rw-r--r--  sys/kern/subr_bus.c | 4885
-rw-r--r--  sys/kern/subr_bus_dma.c | 533
-rw-r--r--  sys/kern/subr_busdma_bufalloc.c | 174
-rw-r--r--  sys/kern/subr_capability.c | 298
-rw-r--r--  sys/kern/subr_clock.c | 225
-rw-r--r--  sys/kern/subr_counter.c | 107
-rw-r--r--  sys/kern/subr_devstat.c | 604
-rw-r--r--  sys/kern/subr_disk.c | 267
-rw-r--r--  sys/kern/subr_dummy_vdso_tc.c | 49
-rw-r--r--  sys/kern/subr_eventhandler.c | 280
-rw-r--r--  sys/kern/subr_fattime.c | 307
-rw-r--r--  sys/kern/subr_firmware.c | 537
-rw-r--r--  sys/kern/subr_hash.c | 128
-rw-r--r--  sys/kern/subr_hints.c | 463
-rw-r--r--  sys/kern/subr_kdb.c | 675
-rw-r--r--  sys/kern/subr_kobj.c | 348
-rw-r--r--  sys/kern/subr_lock.c | 649
-rw-r--r--  sys/kern/subr_log.c | 310
-rw-r--r--  sys/kern/subr_mbpool.c | 402
-rw-r--r--  sys/kern/subr_mchain.c | 554
-rw-r--r--  sys/kern/subr_module.c | 290
-rw-r--r--  sys/kern/subr_msgbuf.c | 418
-rw-r--r--  sys/kern/subr_param.c | 354
-rw-r--r--  sys/kern/subr_pcpu.c | 394
-rw-r--r--  sys/kern/subr_pctrie.c | 705
-rw-r--r--  sys/kern/subr_power.c | 122
-rw-r--r--  sys/kern/subr_prf.c | 1140
-rw-r--r--  sys/kern/subr_prof.c | 589
-rw-r--r--  sys/kern/subr_rman.c | 1160
-rw-r--r--  sys/kern/subr_rtc.c | 178
-rw-r--r--  sys/kern/subr_sbuf.c | 831
-rw-r--r--  sys/kern/subr_scanf.c | 641
-rw-r--r--  sys/kern/subr_sglist.c | 714
-rw-r--r--  sys/kern/subr_sleepqueue.c | 1236
-rw-r--r--  sys/kern/subr_smp.c | 787
-rw-r--r--  sys/kern/subr_stack.c | 277
-rw-r--r--  sys/kern/subr_syscall.c | 235
-rw-r--r--  sys/kern/subr_taskqueue.c | 634
-rw-r--r--  sys/kern/subr_trap.c | 303
-rw-r--r--  sys/kern/subr_turnstile.c | 1308
-rw-r--r--  sys/kern/subr_uio.c | 611
-rw-r--r--  sys/kern/subr_unit.c | 1015
-rw-r--r--  sys/kern/subr_vmem.c | 1487
-rw-r--r--  sys/kern/subr_witness.c | 2912
-rw-r--r--  sys/kern/sys_capability.c | 613
-rw-r--r--  sys/kern/sys_generic.c | 1815
-rw-r--r--  sys/kern/sys_pipe.c | 1834
-rw-r--r--  sys/kern/sys_procdesc.c | 535
-rw-r--r--  sys/kern/sys_process.c | 1242
-rw-r--r--  sys/kern/sys_socket.c | 297
-rw-r--r--  sys/kern/syscalls.c | 554
-rw-r--r--  sys/kern/syscalls.master | 982
-rw-r--r--  sys/kern/systrace_args.c | 10946
-rw-r--r--  sys/kern/sysv_ipc.c | 246
-rw-r--r--  sys/kern/sysv_msg.c | 1592
-rw-r--r--  sys/kern/sysv_sem.c | 1666
-rw-r--r--  sys/kern/sysv_shm.c | 1407
-rw-r--r--  sys/kern/tty.c | 2209
-rw-r--r--  sys/kern/tty_compat.c | 484
-rw-r--r--  sys/kern/tty_info.c | 313
-rw-r--r--  sys/kern/tty_inq.c | 489
-rw-r--r--  sys/kern/tty_outq.c | 339
-rw-r--r--  sys/kern/tty_pts.c | 858
-rw-r--r--  sys/kern/tty_tty.c | 94
-rw-r--r--  sys/kern/tty_ttydisc.c | 1268
-rw-r--r--  sys/kern/uipc_accf.c | 298
-rw-r--r--  sys/kern/uipc_cow.c | 182
-rw-r--r--  sys/kern/uipc_debug.c | 531
-rw-r--r--  sys/kern/uipc_domain.c | 523
-rw-r--r--  sys/kern/uipc_mbuf.c | 2182
-rw-r--r--  sys/kern/uipc_mbuf2.c | 453
-rw-r--r--  sys/kern/uipc_mqueue.c | 2883
-rw-r--r--  sys/kern/uipc_sem.c | 1111
-rw-r--r--  sys/kern/uipc_shm.c | 1033
-rw-r--r--  sys/kern/uipc_sockbuf.c | 1061
-rw-r--r--  sys/kern/uipc_socket.c | 3752
-rw-r--r--  sys/kern/uipc_syscalls.c | 2935
-rw-r--r--  sys/kern/uipc_usrreq.c | 2505
-rw-r--r--  sys/kern/vfs_acl.c | 562
-rw-r--r--  sys/kern/vfs_aio.c | 3069
-rw-r--r--  sys/kern/vfs_bio.c | 4602
-rw-r--r--  sys/kern/vfs_cache.c | 1486
-rw-r--r--  sys/kern/vfs_cluster.c | 1058
-rw-r--r--  sys/kern/vfs_default.c | 1269
-rw-r--r--  sys/kern/vfs_export.c | 493
-rw-r--r--  sys/kern/vfs_extattr.c | 765
-rw-r--r--  sys/kern/vfs_hash.c | 162
-rw-r--r--  sys/kern/vfs_init.c | 344
-rw-r--r--  sys/kern/vfs_lookup.c | 1254
-rw-r--r--  sys/kern/vfs_mount.c | 1949
-rw-r--r--  sys/kern/vfs_mountroot.c | 1041
-rw-r--r--  sys/kern/vfs_subr.c | 4775
-rw-r--r--  sys/kern/vfs_syscalls.c | 4729
-rw-r--r--  sys/kern/vfs_vnops.c | 2083
-rw-r--r--  sys/kern/vnode_if.src | 716
204 files changed, 208154 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
new file mode 100644
index 0000000..cb8a3ff
--- /dev/null
+++ b/sys/kern/Make.tags.inc
@@ -0,0 +1,97 @@
+# $FreeBSD$
+# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93
+
+SYS?= ${.CURDIR}/..
+
+# Common files for "make tags", included by the Makefile for each
+# architecture.
+
+# Put the /sys/sys include files at the end so that subroutine definitions
+# win when there is a struct tag with the same name (e.g., vmmeter). The
+# better solution would be for ctags to generate "struct vmmeter" tags.
+
+COMM= ${SYS}/dev/advansys/*.[ch] \
+ ${SYS}/dev/aha/*.[ch] \
+ ${SYS}/dev/aic7xxx/*.[ch] \
+ ${SYS}/dev/buslogic/*.[ch] \
+ ${SYS}/dev/dpt/*.[ch] \
+ ${SYS}/dev/en/*.[ch] \
+ ${SYS}/dev/iicbus/*.[ch] \
+ ${SYS}/dev/isp/*.[ch] \
+ ${SYS}/dev/pdq/*.[ch] \
+ ${SYS}/dev/ppbus/*.[ch] \
+ ${SYS}/dev/smbus/*.[ch] \
+ ${SYS}/dev/vx/*.[ch] \
+ ${SYS}/fs/cd9660/*.[ch] \
+ ${SYS}/fs/deadfs/*.[ch] \
+ ${SYS}/fs/devfs/*.[ch] \
+ ${SYS}/fs/fdescfs/*.[ch] \
+ ${SYS}/fs/fifofs/*.[ch] \
+ ${SYS}/fs/msdosfs/*.[ch] \
+ ${SYS}/fs/nullfs/*.[ch] \
+ ${SYS}/fs/procfs/*.[ch] \
+ ${SYS}/fs/smbfs/*.[ch] \
+ ${SYS}/fs/udf/*.[ch] \
+ ${SYS}/fs/unionfs/*.[ch] \
+ ${SYS}/geom/*.[ch] \
+ ${SYS}/kern/*.[ch] \
+ ${SYS}/net/*.[ch] \
+ ${SYS}/netatalk/*.[ch] \
+ ${SYS}/netinet/*.[ch] \
+ ${SYS}/netinet6/*.[ch] \
+ ${SYS}/netipsec/*.[ch] \
+ ${SYS}/netipx/*.[ch] \
+ ${SYS}/netnatm/*.[ch] \
+ ${SYS}/nfs/*.[ch] \
+ ${SYS}/nfsclient/*.[ch] \
+ ${SYS}/nfsserver/*.[ch] \
+ ${SYS}/pci/*.[ch] \
+ ${SYS}/ufs/ffs/*.[ch] \
+ ${SYS}/ufs/ufs/*.[ch] \
+ ${SYS}/vm/*.[ch] \
+ ${SYS}/sys/*.[ch]
+
+COMMDIR1= ${SYS}/conf \
+ ${SYS}/geom \
+ ${SYS}/kern \
+ ${SYS}/net \
+ ${SYS}/netatalk \
+ ${SYS}/netinet \
+ ${SYS}/netinet6 \
+ ${SYS}/netipsec \
+ ${SYS}/netipx \
+ ${SYS}/netnatm \
+ ${SYS}/nfs \
+ ${SYS}/pci \
+ ${SYS}/vm \
+ ${SYS}/sys
+
+COMMDIR2= ${SYS}/dev/advansys \
+ ${SYS}/dev/aha \
+ ${SYS}/dev/aic7xxx \
+ ${SYS}/dev/buslogic \
+ ${SYS}/dev/ccd \
+ ${SYS}/dev/dec \
+ ${SYS}/dev/dpt \
+ ${SYS}/dev/en \
+ ${SYS}/dev/hea \
+ ${SYS}/dev/hfa \
+ ${SYS}/dev/iicbus \
+ ${SYS}/dev/isp \
+ ${SYS}/dev/pdq \
+ ${SYS}/dev/ppbus \
+ ${SYS}/dev/smbus \
+ ${SYS}/dev/vn \
+ ${SYS}/dev/vx \
+ ${SYS}/fs/deadfs \
+ ${SYS}/fs/devfs \
+ ${SYS}/fs/fdescfs \
+ ${SYS}/fs/fifofs \
+ ${SYS}/fs/msdosfs \
+ ${SYS}/fs/nullfs \
+ ${SYS}/fs/procfs \
+ ${SYS}/fs/specfs \
+ ${SYS}/fs/unionfs \
+ ${SYS}/fs/cd9660 \
+ ${SYS}/ufs/ffs \
+ ${SYS}/ufs/ufs
diff --git a/sys/kern/Makefile b/sys/kern/Makefile
new file mode 100644
index 0000000..0721e82
--- /dev/null
+++ b/sys/kern/Makefile
@@ -0,0 +1,21 @@
+# @(#)Makefile 8.2 (Berkeley) 3/21/94
+# $FreeBSD$
+
+# Makefile for init_sysent
+
+all:
+ @echo "make sysent only"
+
+sysent: init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall.mk \
+../sys/sysproto.h
+
+init_sysent.c syscalls.c systrace_args.c ../sys/syscall.h \
+../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master \
+capabilities.conf
+ -mv -f init_sysent.c init_sysent.c.bak
+ -mv -f syscalls.c syscalls.c.bak
+ -mv -f systrace_args.c systrace_args.c.bak
+ -mv -f ../sys/syscall.h ../sys/syscall.h.bak
+ -mv -f ../sys/syscall.mk ../sys/syscall.mk.bak
+ -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
+ sh makesyscalls.sh syscalls.master
diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m
new file mode 100644
index 0000000..b0ad611
--- /dev/null
+++ b/sys/kern/bus_if.m
@@ -0,0 +1,672 @@
+#-
+# Copyright (c) 1998-2004 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+/**
+ * @defgroup BUS bus - KObj methods for drivers of devices with children
+ * @brief A set of methods required by device drivers that support
+ * child devices.
+ * @{
+ */
+INTERFACE bus;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static struct resource *
+ null_alloc_resource(device_t dev, device_t child,
+ int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+ {
+ return (0);
+ }
+
+ static int
+ null_remap_intr(device_t bus, device_t dev, u_int irq)
+ {
+
+ if (dev != NULL)
+ return (BUS_REMAP_INTR(dev, NULL, irq));
+ return (ENXIO);
+ }
+
+ static device_t
+ null_add_child(device_t bus, int order, const char *name,
+ int unit)
+ {
+
+ panic("bus_add_child is not implemented");
+ }
+};
+
+/**
+ * @brief Print a description of a child device
+ *
+ * This is called from system code which prints out a description of a
+ * device. It should describe the attachment that the child has with
+ * the parent. For instance the TurboLaser bus prints which node the
+ * device is attached to. See bus_generic_print_child() for more
+ * information.
+ *
+ * @param _dev the device whose child is being printed
+ * @param _child the child device to describe
+ *
+ * @returns the number of characters output.
+ */
+METHOD int print_child {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_print_child;
+
+/**
+ * @brief Print a notification about an unprobed child device.
+ *
+ * Called for each child device that did not succeed in probing for a
+ * driver.
+ *
+ * @param _dev the device whose child was being probed
+ * @param _child the child device which failed to probe
+ */
+METHOD void probe_nomatch {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Read the value of a bus-specific attribute of a device
+ *
+ * This method, along with BUS_WRITE_IVAR() manages a bus-specific set
+ * of instance variables of a child device. The intention is that
+ * each different type of bus defines a set of appropriate instance
+ * variables (such as ports and irqs for ISA bus etc.)
+ *
+ * This information could be given to the child device as a struct but
+ * that makes it hard for a bus to add or remove variables without
+ * forcing an edit and recompile for all drivers which may not be
+ * possible for vendor supplied binary drivers.
+ *
+ * This method copies the value of an instance variable to the
+ * location specified by @p *_result.
+ *
+ * @param _dev the device whose child was being examined
+ * @param _child the child device whose instance variable is
+ * being read
+ * @param _index the instance variable to read
+ * @param _result a location to receive the instance variable
+ * value
+ *
+ * @retval 0 success
+ * @retval ENOENT no such instance variable is supported by @p
+ * _dev
+ */
+METHOD int read_ivar {
+ device_t _dev;
+ device_t _child;
+ int _index;
+ uintptr_t *_result;
+};
+
+/**
+ * @brief Write the value of a bus-specific attribute of a device
+ *
+ * This method sets the value of an instance variable to @p _value.
+ *
+ * @param _dev the device whose child was being updated
+ * @param _child the child device whose instance variable is
+ * being written
+ * @param _index the instance variable to write
+ * @param _value the value to write to that instance variable
+ *
+ * @retval 0 success
+ * @retval ENOENT no such instance variable is supported by @p
+ * _dev
+ * @retval EINVAL the instance variable was recognised but
+ * contains a read-only value
+ */
+METHOD int write_ivar {
+ device_t _dev;
+ device_t _child;
+ int _indx;
+ uintptr_t _value;
+};
+
+/**
+ * @brief Notify a bus that a child was deleted
+ *
+ * Called at the beginning of device_delete_child() to allow the parent
+ * to teardown any bus-specific state for the child.
+ *
+ * @param _dev the device whose child is being deleted
+ * @param _child the child device which is being deleted
+ */
+METHOD void child_deleted {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Notify a bus that a child was detached
+ *
+ * Called after the child's DEVICE_DETACH() method to allow the parent
+ * to reclaim any resources allocated on behalf of the child.
+ *
+ * @param _dev the device whose child changed state
+ * @param _child the child device which changed state
+ */
+METHOD void child_detached {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Notify a bus that a new driver was added
+ *
+ * Called when a new driver is added to the devclass which owns this
+ * bus. The generic implementation of this method attempts to probe and
+ * attach any un-matched children of the bus.
+ *
+ * @param _dev the device whose devclass had a new driver
+ * added to it
+ * @param _driver the new driver which was added
+ */
+METHOD void driver_added {
+ device_t _dev;
+ driver_t *_driver;
+} DEFAULT bus_generic_driver_added;
+
+/**
+ * @brief Create a new child device
+ *
+ * For busses which use drivers supporting DEVICE_IDENTIFY() to
+ * enumerate their devices, this method is used to create new
+ * device instances. The new device will be added after the last
+ * existing child with the same order.
+ *
+ * @param _dev the bus device which will be the parent of the
+ * new child device
+ * @param _order a value which is used to partially sort the
+ * children of @p _dev - devices created using
+ * lower values of @p _order appear first in @p
+ * _dev's list of children
+ * @param _name devclass name for new device or @c NULL if not
+ * specified
+ * @param _unit unit number for new device or @c -1 if not
+ * specified
+ */
+METHOD device_t add_child {
+ device_t _dev;
+ u_int _order;
+ const char *_name;
+ int _unit;
+} DEFAULT null_add_child;
+
+/**
+ * @brief Allocate a system resource
+ *
+ * This method is called by child devices of a bus to allocate resources.
+ * The types are defined in <machine/resource.h>; the meaning of the
+ * resource-ID field varies from bus to bus (but @p *rid == 0 is always
+ * valid if the resource type is). If a resource was allocated and the
+ * caller did not use the RF_ACTIVE flag to specify that it should be
+ * activated immediately, the caller is responsible for calling
+ * BUS_ACTIVATE_RESOURCE() when it actually uses the resource.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is requesting an allocation
+ * @param _type the type of resource to allocate
+ * @param _rid a pointer to the resource identifier
+ * @param _start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param _end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param _count hint at the size of range required - pass @c 1
+ * for any size
+ * @param _flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+METHOD struct resource * alloc_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int *_rid;
+ u_long _start;
+ u_long _end;
+ u_long _count;
+ u_int _flags;
+} DEFAULT null_alloc_resource;
+
+/**
+ * @brief Activate a resource
+ *
+ * Activate a resource previously allocated with
+ * BUS_ALLOC_RESOURCE(). This may for instance map a memory region
+ * into the kernel's virtual address space.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to activate
+ */
+METHOD int activate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+/**
+ * @brief Deactivate a resource
+ *
+ * Deactivate a resource previously allocated with
+ * BUS_ALLOC_RESOURCE(). This may for instance unmap a memory region
+ * from the kernel's virtual address space.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to deactivate
+ */
+METHOD int deactivate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+/**
+ * @brief Adjust a resource
+ *
+ * Adjust the start and/or end of a resource allocated by
+ * BUS_ALLOC_RESOURCE. At least part of the new address range must overlap
+ * with the existing address range. If successful, the resource's range
+ * will be adjusted to [start, end] on return.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _res the resource to adjust
+ * @param _start the new starting address of the resource range
+ * @param _end the new ending address of the resource range
+ */
+METHOD int adjust_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ struct resource *_res;
+ u_long _start;
+ u_long _end;
+};
+
+/**
+ * @brief Release a resource
+ *
+ * Free a resource allocated by BUS_ALLOC_RESOURCE(). The @p _rid
+ * value must be the same as the one returned by BUS_ALLOC_RESOURCE()
+ * (which is not necessarily the same as the one the client passed).
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to release
+ */
+METHOD int release_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_res;
+};
+
+/**
+ * @brief Install an interrupt handler
+ *
+ * This method is used to associate an interrupt handler function with
+ * an irq resource. When the interrupt triggers, the function @p _intr
+ * will be called with the value of @p _arg as its single
+ * argument. The value returned in @p *_cookiep is used to cancel the
+ * interrupt handler - the caller should save this value to use in a
+ * future call to BUS_TEARDOWN_INTR().
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _flags a set of bits from enum intr_type specifying
+ * the class of interrupt
+ * @param _intr the function to call when the interrupt
+ * triggers
+ * @param _arg a value to use as the single argument in calls
+ * to @p _intr
+ * @param _cookiep a pointer to a location to receive a cookie
+ * value that may be used to remove the interrupt
+ * handler
+ */
+METHOD int setup_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ int _flags;
+ driver_filter_t *_filter;
+ driver_intr_t *_intr;
+ void *_arg;
+ void **_cookiep;
+};
+
+/**
+ * @brief Uninstall an interrupt handler
+ *
+ * This method is used to disassociate an interrupt handler function
+ * with an irq resource. The value of @p _cookie must be the value
+ * returned from a previous call to BUS_SETUP_INTR().
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cookie the cookie value returned when the interrupt
+ * was originally registered
+ */
+METHOD int teardown_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ void *_cookie;
+};
+
+/**
+ * @brief Define a resource which can be allocated with
+ * BUS_ALLOC_RESOURCE().
+ *
+ * This method is used by some busses (typically ISA) to allow a
+ * driver to describe a resource range that it would like to
+ * allocate. The resource defined by @p _type and @p _rid is defined
+ * to start at @p _start and to include @p _count indices in its
+ * range.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _start the start of the resource range
+ * @param _count the size of the resource range
+ */
+METHOD int set_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long _start;
+ u_long _count;
+};
+
+/**
+ * @brief Describe a resource
+ *
+ * This method allows a driver to examine the range used for a given
+ * resource without actually allocating it.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _start the address of a location to receive the start
+ * index of the resource range
+ * @param _count the address of a location to receive the size
+ * of the resource range
+ */
+METHOD int get_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long *_startp;
+ u_long *_countp;
+};
+
+/**
+ * @brief Delete a resource.
+ *
+ * Use this to delete a resource (possibly one previously added with
+ * BUS_SET_RESOURCE()).
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ */
+METHOD void delete_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+};
+
+/**
+ * @brief Return a struct resource_list.
+ *
+ * Used by drivers which use bus_generic_rl_alloc_resource() etc. to
+ * implement their resource handling. It should return the resource
+ * list of the given child device.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource list
+ */
+METHOD struct resource_list * get_resource_list {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_resource_list;
+
+/**
+ * @brief Is the hardware described by @p _child still attached to the
+ * system?
+ *
+ * This method should return 0 if the device is not present. It
+ * should return -1 if it is present. Any errors encountered while
+ * making this determination should be returned as a normal errno
+ * value. Client drivers are to
+ * assume that the device is present, even if there is an error
+ * determining if it is there. Busses are to try to avoid returning
+ * errors, but newcard will return an error if the device fails to
+ * implement this method.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ */
+METHOD int child_present {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_child_present;
+
+/**
+ * @brief Returns the pnp info for this device.
+ *
+ * Return it as a string. If the buffer is too small to hold the
+ * string, return EOVERFLOW.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ * @param _buf the address of a buffer to receive the pnp
+ * string
+ * @param _buflen the size of the buffer pointed to by @p _buf
+ */
+METHOD int child_pnpinfo_str {
+ device_t _dev;
+ device_t _child;
+ char *_buf;
+ size_t _buflen;
+};
+
+/**
+ * @brief Returns the location for this device.
+ *
+ * Return it as a string. If the buffer is too small to hold the
+ * string, return EOVERFLOW.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ * @param _buf the address of a buffer to receive the location
+ * string
+ * @param _buflen the size of the buffer pointed to by @p _buf
+ */
+METHOD int child_location_str {
+ device_t _dev;
+ device_t _child;
+ char *_buf;
+ size_t _buflen;
+};
+
+/**
+ * @brief Allow drivers to request that an interrupt be bound to a specific
+ * CPU.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cpu the CPU to bind the interrupt to
+ */
+METHOD int bind_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ int _cpu;
+} DEFAULT bus_generic_bind_intr;
+
+/**
+ * @brief Allow (bus) drivers to specify the trigger mode and polarity
+ * of the specified interrupt.
+ *
+ * @param _dev the bus device
+ * @param _irq the interrupt number to modify
+ * @param _trig the trigger mode required
+ * @param _pol the interrupt polarity required
+ */
+METHOD int config_intr {
+ device_t _dev;
+ int _irq;
+ enum intr_trigger _trig;
+ enum intr_polarity _pol;
+} DEFAULT bus_generic_config_intr;
+
+/**
+ * @brief Allow drivers to associate a description with an active
+ * interrupt handler.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cookie the cookie value returned when the interrupt
+ * was originally registered
+ * @param _descr the description to associate with the interrupt
+ */
+METHOD int describe_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ void *_cookie;
+ const char *_descr;
+} DEFAULT bus_generic_describe_intr;
+
+/**
+ * @brief Notify a (bus) driver about a child that the hints mechanism
+ * believes it has discovered.
+ *
+ * The bus is responsible for then adding the child in the right order
+ * and discovering other things about the child. The bus driver is
+ * free to ignore this hint, to do special things, etc. It is all up
+ * to the bus driver to interpret.
+ *
+ * This method is only called in response to the parent bus asking for
+ * hinted devices to be enumerated.
+ *
+ * @param _dev the bus device
+ * @param _dname the name of the device w/o unit numbers
+ * @param _dunit the unit number of the device
+ */
+METHOD void hinted_child {
+ device_t _dev;
+ const char *_dname;
+ int _dunit;
+};
+
+/**
+ * @brief Returns bus_dma_tag_t for use w/ devices on the bus.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device to which the tag will belong
+ */
+METHOD bus_dma_tag_t get_dma_tag {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_dma_tag;
+
+/**
+ * @brief Allow the bus to determine the unit number of a device.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device whose unit is to be wired
+ * @param _name the name of the device's new devclass
+ * @param _unitp a pointer to the device's new unit value
+ */
+METHOD void hint_device_unit {
+ device_t _dev;
+ device_t _child;
+ const char *_name;
+ int *_unitp;
+};
+
+/**
+ * @brief Notify a bus that the bus pass level has been changed
+ *
+ * @param _dev the bus device
+ */
+METHOD void new_pass {
+ device_t _dev;
+} DEFAULT bus_generic_new_pass;
+
+/**
+ * @brief Notify a bus that specified child's IRQ should be remapped.
+ *
+ * @param _dev the bus device
+ * @param _child the child device
+ * @param _irq the irq number
+ */
+METHOD int remap_intr {
+ device_t _dev;
+ device_t _child;
+ u_int _irq;
+} DEFAULT null_remap_intr;
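
For readers unfamiliar with newbus, the sketch below shows how a bus driver typically supplies implementations for the methods declared above: it provides its own BUS_READ_IVAR() and falls back on the kernel's bus_generic_*() defaults for the rest. This is illustrative only and not part of the change above; the "foo" name, its instance variable, and the handler body are hypothetical, while DEVMETHOD(), DEVMETHOD_END and the bus_generic_*() helpers are the standard kernel glue.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>

#define	FOO_IVAR_PORT	0		/* made-up instance variable */

static int
foo_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
{
	/* BUS_READ_IVAR(): copy the requested ivar into *result. */
	switch (index) {
	case FOO_IVAR_PORT:
		*result = 0x300;	/* placeholder value */
		return (0);
	default:
		return (ENOENT);	/* unknown instance variable */
	}
}

/*
 * Fragment of the driver's method table: only bus_if.m entries are shown;
 * the device_if.m entries (probe, attach, ...) would sit alongside them.
 */
static device_method_t foo_bus_methods[] = {
	DEVMETHOD(bus_print_child,	bus_generic_print_child),
	DEVMETHOD(bus_read_ivar,	foo_read_ivar),
	DEVMETHOD(bus_alloc_resource,	bus_generic_alloc_resource),
	DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),

	DEVMETHOD_END
};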
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
new file mode 100644
index 0000000..7f68668
--- /dev/null
+++ b/sys/kern/capabilities.conf
@@ -0,0 +1,754 @@
+##
+## Copyright (c) 2008-2010 Robert N. M. Watson
+## All rights reserved.
+##
+## This software was developed at the University of Cambridge Computer
+## Laboratory with support from a grant from Google, Inc.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions
+## are met:
+## 1. Redistributions of source code must retain the above copyright
+## notice, this list of conditions and the following disclaimer.
+## 2. Redistributions in binary form must reproduce the above copyright
+## notice, this list of conditions and the following disclaimer in the
+## documentation and/or other materials provided with the distribution.
+##
+## THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+## ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+## ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+## FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+## LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+## OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+## SUCH DAMAGE.
+##
+## List of system calls enabled in capability mode, one name per line.
+##
+## Notes:
+## - sys_exit(2), abort2(2) and close(2) are very important.
+## - Sorted alphabetically; please keep it that way.
+##
+## $FreeBSD$
+##
+
+##
+## Allow ACL and MAC label operations by file descriptor, subject to
+## capability rights. Allow MAC label operations on the current process but
+## we will need to scope __mac_get_pid(2).
+##
+__acl_aclcheck_fd
+__acl_delete_fd
+__acl_get_fd
+__acl_set_fd
+__mac_get_fd
+#__mac_get_pid
+__mac_get_proc
+__mac_set_fd
+__mac_set_proc
+
+##
+## Allow sysctl(2), as scoping is handled inside the call; this is a global
+## namespace, but there are several critical sysctls required for almost
+## anything to run, such as hw.pagesize. For now that policy lives in the
+## kernel for performance and simplicity, but perhaps it could move to a
+## proxying daemon in userspace.
+##
+__sysctl
+
+##
+## Allow umtx operations as these are scoped by address space.
+##
+## XXXRW: Need to check this very carefully.
+##
+_umtx_lock
+_umtx_op
+_umtx_unlock
+
+##
+## Allow process termination using abort2(2).
+##
+abort2
+
+##
+## Allow accept(2) since it doesn't manipulate namespaces directly; rather, it
+## relies on existing bindings on a socket, subject to capability rights.
+##
+accept
+accept4
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+aio_cancel
+aio_error
+aio_fsync
+aio_read
+aio_return
+aio_suspend
+aio_waitcomplete
+aio_write
+
+##
+## audit(2) is a global operation, submitting to the global trail, but it is
+## controlled by privilege, and it might be useful to be able to submit
+## records from sandboxes. For now, disallow, but we may want to think about
+## providing some sort of proxy service for this.
+##
+#audit
+
+##
+## Allow bindat(2).
+##
+bindat
+
+##
+## Allow capability mode and capability system calls.
+##
+cap_enter
+cap_fcntls_get
+cap_fcntls_limit
+cap_getmode
+cap_ioctls_get
+cap_ioctls_limit
+__cap_rights_get
+cap_rights_limit
+
+##
+## Allow read-only clock operations.
+##
+clock_getres
+clock_gettime
+
+##
+## Always allow file descriptor close(2).
+##
+close
+closefrom
+
+##
+## Allow connectat(2).
+##
+connectat
+
+##
+## cpuset(2) and related calls require scoping by process, but should
+## eventually be allowed, at least in the current process case.
+##
+#cpuset
+#cpuset_getaffinity
+#cpuset_getid
+#cpuset_setaffinity
+#cpuset_setid
+
+##
+## Always allow dup(2) and dup2(2) manipulation of the file descriptor table.
+##
+dup
+dup2
+
+##
+## Allow extended attribute operations by file descriptor, subject to
+## capability rights.
+##
+extattr_delete_fd
+extattr_get_fd
+extattr_list_fd
+extattr_set_fd
+
+##
+## Allow changing file flags, mode, and owner by file descriptor, subject to
+## capability rights.
+##
+fchflags
+fchmod
+fchown
+
+##
+## For now, allow fcntl(2), subject to capability rights, but this probably
+## needs additional scoping.
+##
+fcntl
+
+##
+## Allow fexecve(2), subject to capability rights. We perform some scoping,
+## such as disallowing privilege escalation.
+##
+fexecve
+
+##
+## Allow flock(2), subject to capability rights.
+##
+flock
+
+##
+## Allow fork(2), even though it returns pids -- some applications seem to
+## prefer this interface.
+##
+fork
+
+##
+## Allow fpathconf(2), subject to capability rights.
+##
+fpathconf
+
+##
+## Allow various file descriptor-based I/O operations, subject to capability
+## rights.
+##
+freebsd6_ftruncate
+freebsd6_lseek
+freebsd6_mmap
+freebsd6_pread
+freebsd6_pwrite
+
+##
+## Allow querying file and file system state with fstat(2) and fstatfs(2),
+## subject to capability rights.
+##
+fstat
+fstatfs
+
+##
+## Allow further file descriptor-based I/O operations, subject to capability
+## rights.
+##
+fsync
+ftruncate
+
+##
+## Allow futimes(2), subject to capability rights.
+##
+futimes
+
+##
+## Allow querying process audit state, subject to normal access control.
+##
+getaudit
+getaudit_addr
+getauid
+
+##
+## Allow thread context management with getcontext(2).
+##
+getcontext
+
+##
+## Allow directory I/O on a file descriptor, subject to capability rights.
+## Originally we had separate capabilities for directory-specific read
+## operations, but on BSD we allow reading the raw directory data, so we just
+## rely on CAP_READ now.
+##
+getdents
+getdirentries
+
+##
+## Allow querying certain trivial global state.
+##
+getdomainname
+
+##
+## Allow querying current process credential state.
+##
+getegid
+geteuid
+
+##
+## Allow querying certain trivial global state.
+##
+gethostid
+gethostname
+
+##
+## Allow querying per-process timer.
+##
+getitimer
+
+##
+## Allow querying current process credential state.
+##
+getgid
+getgroups
+getlogin
+
+##
+## Allow querying certain trivial global state.
+##
+getpagesize
+getpeername
+
+##
+## Allow querying certain per-process scheduling, resource limit, and
+## credential state.
+##
+## XXXRW: getpgid(2) needs scoping. It's not clear if it's worth scoping
+## getppid(2). getpriority(2) needs scoping. getrusage(2) needs scoping.
+## getsid(2) needs scoping.
+##
+getpgid
+getpgrp
+getpid
+getppid
+getpriority
+getresgid
+getresuid
+getrlimit
+getrusage
+getsid
+
+##
+## Allow querying socket state, subject to capability rights.
+##
+## XXXRW: getsockopt(2) may need more attention.
+##
+getsockname
+getsockopt
+
+##
+## Allow querying the global clock.
+##
+gettimeofday
+
+##
+## Allow querying current process credential state.
+##
+getuid
+
+##
+## Allow ioctl(2), which applications will hopefully limit to only the
+## required commands using the cap_ioctls_limit(2) syscall.
+##
+ioctl
+
+##
+## Allow querying current process credential state.
+##
+issetugid
+
+##
+## Allow kevent(2), as we will authorize based on capability rights on the
+## target descriptor.
+##
+kevent
+
+##
+## Allow kill(2), as we allow the process to send signals only to itself.
+##
+kill
+
+##
+## Allow message queue operations on file descriptors, subject to capability
+## rights.
+##
+kmq_notify
+kmq_setattr
+kmq_timedreceive
+kmq_timedsend
+
+##
+## Allow kqueue(2); we will control its use.
+##
+kqueue
+
+##
+## Allow managing per-process timers.
+##
+ktimer_create
+ktimer_delete
+ktimer_getoverrun
+ktimer_gettime
+ktimer_settime
+
+##
+## We can't allow ktrace(2) because it relies on a global namespace, but we
+## might want to introduce an fktrace(2) of some sort.
+##
+#ktrace
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+lio_listio
+
+##
+## Allow listen(2), subject to capability rights.
+##
+## XXXRW: One might argue this manipulates a global namespace.
+##
+listen
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+lseek
+
+##
+## Allow MAC label operations by file descriptor, subject to capability
+## rights.
+##
+mac_get_fd
+mac_set_fd
+
+##
+## Allow simple VM operations on the current process.
+##
+madvise
+mincore
+minherit
+mlock
+mlockall
+
+##
+## Allow memory mapping a file descriptor, and updating protections, subject
+## to capability rights.
+##
+mmap
+mprotect
+
+##
+## Allow simple VM operations on the current process.
+##
+msync
+munlock
+munlockall
+munmap
+
+##
+## Allow the current process to sleep.
+##
+nanosleep
+
+##
+## Allow querying the global clock.
+##
+ntp_gettime
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+oaio_read
+oaio_write
+
+##
+## Allow simple VM operations on the current process.
+##
+obreak
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+olio_listio
+
+##
+## Operations relative to directory capabilities.
+##
+chflagsat
+faccessat
+fchmodat
+fchownat
+fstatat
+futimesat
+linkat
+mkdirat
+mkfifoat
+mknodat
+openat
+readlinkat
+renameat
+symlinkat
+unlinkat
+
+##
+## Allow entry into open(2). This system call will fail, since access to the
+## global file namespace has been disallowed, but allowing entry into the
+## syscall means that an audit trail will be generated (which is also very
+## useful for debugging).
+##
+open
+
+##
+## Allow poll(2), which will be scoped by capability rights.
+##
+## XXXRW: Perhaps we don't need the OpenBSD version?
+## XXXRW: We don't yet do that scoping.
+##
+openbsd_poll
+
+##
+## Process descriptor-related system calls are allowed.
+##
+pdfork
+pdgetpid
+pdkill
+#pdwait4 # not yet implemented
+
+##
+## Allow pipe(2).
+##
+pipe
+pipe2
+
+##
+## Allow poll(2), which will be scoped by capability rights.
+## XXXRW: We don't yet do that scoping.
+##
+poll
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+pread
+preadv
+
+##
+## Allow access to profiling state on the current process.
+##
+profil
+
+##
+## Disallow ptrace(2) for now, but we do need debugging facilities in
+## capability mode, so we will want to revisit this, possibly by scoping its
+## operation.
+##
+#ptrace
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+pwrite
+pwritev
+read
+readv
+recv
+recvfrom
+recvmsg
+
+##
+## Allow real-time scheduling primitives to be used.
+##
+## XXXRW: These require scoping.
+##
+rtprio
+rtprio_thread
+
+##
+## Allow simple VM operations on the current process.
+##
+sbrk
+
+##
+## Allow querying trivial global scheduler state.
+##
+sched_get_priority_max
+sched_get_priority_min
+
+##
+## Allow various thread/process scheduler operations.
+##
+## XXXRW: Some of these require further scoping.
+##
+sched_getparam
+sched_getscheduler
+sched_rr_getinterval
+sched_setparam
+sched_setscheduler
+sched_yield
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+sctp_generic_recvmsg
+sctp_generic_sendmsg
+sctp_generic_sendmsg_iov
+sctp_peeloff
+
+##
+## Allow select(2), which will be scoped by capability rights.
+##
+## XXXRW: But is it?
+##
+select
+
+##
+## Allow I/O-related file descriptors, subject to capability rights. Use of
+## explicit addresses here is restricted by the system calls themselves.
+##
+send
+sendfile
+sendmsg
+sendto
+
+##
+## Allow setting per-process audit state, which is controlled separately by
+## privileges.
+##
+setaudit
+setaudit_addr
+setauid
+
+##
+## Allow setting thread context.
+##
+setcontext
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setegid
+seteuid
+setgid
+
+##
+## Allow use of the process interval timer.
+##
+setitimer
+
+##
+## Allow setpriority(2).
+##
+## XXXRW: Requires scoping.
+##
+setpriority
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setregid
+setresgid
+setresuid
+setreuid
+
+##
+## Allow setting process resource limits with setrlimit(2).
+##
+setrlimit
+
+##
+## Allow creating a new session with setsid(2).
+##
+setsid
+
+##
+## Allow setting socket options with setsockopt(2), subject to capability
+## rights.
+##
+## XXXRW: Might require scoping.
+##
+setsockopt
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setuid
+
+##
+## shm_open(2) is scoped so as to allow only access to new anonymous objects.
+##
+shm_open
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+shutdown
+
+##
+## Allow signal control on current process.
+##
+sigaction
+sigaltstack
+sigblock
+sigpending
+sigprocmask
+sigqueue
+sigreturn
+sigsetmask
+sigstack
+sigsuspend
+sigtimedwait
+sigvec
+sigwaitinfo
+
+##
+## Allow creating new socket pairs with socket(2) and socketpair(2).
+##
+socket
+socketpair
+
+##
+## Allow simple VM operations on the current process.
+##
+## XXXRW: Kernel doesn't implement this, so drop?
+##
+sstk
+
+##
+## Do allow sync(2) for now, but possibly shouldn't.
+##
+sync
+
+##
+## Always allow process termination with sys_exit(2).
+##
+sys_exit
+
+##
+## sysarch(2) does rather diverse things, but is required on at least i386
+## in order to configure per-thread data. As such, it's scoped on each
+## architecture.
+##
+sysarch
+
+##
+## Allow thread operations operating only on current process.
+##
+thr_create
+thr_exit
+thr_kill
+
+##
+## Disallow thr_kill2(2), as it may operate beyond the current process.
+##
+## XXXRW: Requires scoping.
+##
+#thr_kill2
+
+##
+## Allow thread operations operating only on current process.
+##
+thr_new
+thr_self
+thr_set_name
+thr_suspend
+thr_wake
+
+##
+## Allow manipulation of the current process umask with umask(2).
+##
+umask
+
+##
+## Allow submitting of process trace entries with utrace(2).
+##
+utrace
+
+##
+## Allow generating UUIDs with uuidgen(2).
+##
+uuidgen
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+write
+writev
+
+##
+## Allow processes to yield(2).
+##
+yield
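
To make the policy encoded by the list above concrete, here is a small, hypothetical userland sketch (not part of the change above) of how a program is expected to behave once it calls cap_enter(2): descriptors acquired beforehand keep working through the *at() and fd-based calls listed above, while calls that name the global filesystem namespace, such as open(2) of an absolute path, fail with ECAPMODE. The file names and paths are made up; on newer FreeBSD the header is <sys/capsicum.h> rather than <sys/capability.h>.

/* Hypothetical example -- not part of this change. */
#include <sys/capability.h>	/* cap_enter(); <sys/capsicum.h> on newer systems */

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int dirfd, fd;

	/* Acquire resources before entering the sandbox. */
	dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
	if (dirfd == -1)
		err(1, "open(/tmp)");

	if (cap_enter() == -1)
		err(1, "cap_enter");

	/* openat(2) is on the list above, so this works relative to dirfd... */
	fd = openat(dirfd, "scratch", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		warn("openat");

	/* ...but open(2) of an absolute path is rejected in capability mode. */
	if (open("/etc/passwd", O_RDONLY) == -1)
		warn("open(/etc/passwd) rejected as expected");

	if (fd != -1)
		close(fd);
	close(dirfd);
	return (0);
}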
diff --git a/sys/kern/clock_if.m b/sys/kern/clock_if.m
new file mode 100644
index 0000000..cb1179a
--- /dev/null
+++ b/sys/kern/clock_if.m
@@ -0,0 +1,45 @@
+#-
+# Copyright (c) 2001 by Thomas Moestl <tmm@FreeBSD.org>.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+#include <sys/time.h>
+
+INTERFACE clock;
+
+# Interface for clock drivers. This is inspired by the NetBSD device-independent
+# clock code (by Gordon W. Ross).
+
+# An EINVAL error return from this call signifies that the clock has an illegal
+# setting.
+METHOD int gettime {
+ device_t dev;
+ struct timespec *ts;
+};
+
+METHOD int settime {
+ device_t dev;
+ struct timespec *ts;
+};
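
As a sketch of how this interface is consumed (illustrative only, not part of the change above): a real-time clock driver implements the two methods, converting between struct timespec and the calendar representation with the clock_ct_to_ts()/clock_ts_to_ct() helpers from subr_clock.c, and announces itself via clock_register() from subr_rtc.c. The "myrtc" name and the fixed date are made up; a real driver would read and program the hardware.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/clock.h>
#include <sys/time.h>

static int
myrtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* A real driver would read these fields from the RTC hardware. */
	ct.nsec = 0;
	ct.sec = 0;
	ct.min = 0;
	ct.hour = 0;
	ct.day = 1;
	ct.mon = 1;
	ct.year = 2013;
	ct.dow = -1;			/* day of week unknown */

	/* clock_ct_to_ts() returns EINVAL for an illegal setting. */
	return (clock_ct_to_ts(&ct, ts));
}

static int
myrtc_settime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	clock_ts_to_ct(ts, &ct);
	/* Program the RTC hardware from ct here. */
	return (0);
}

/*
 * In the driver's attach routine, after wiring the methods up with
 * DEVMETHOD(clock_gettime, myrtc_gettime) and DEVMETHOD(clock_settime,
 * myrtc_settime), the clock is registered with a 1 s resolution:
 *
 *	clock_register(dev, 1000000);	// resolution in microseconds
 */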
diff --git a/sys/kern/cpufreq_if.m b/sys/kern/cpufreq_if.m
new file mode 100644
index 0000000..8b1213e
--- /dev/null
+++ b/sys/kern/cpufreq_if.m
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2004 Nate Lawson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+INTERFACE cpufreq;
+
+HEADER {
+ struct cf_level;
+ struct cf_setting;
+};
+
+# cpufreq interface methods
+
+#
+# Set the current CPU frequency level.
+#
+METHOD int set {
+ device_t dev;
+ const struct cf_level *level;
+ int priority;
+};
+
+#
+# Get the current active level.
+#
+METHOD int get {
+ device_t dev;
+ struct cf_level *level;
+};
+
+#
+# Get the current possible levels, based on all drivers.
+#
+METHOD int levels {
+ device_t dev;
+ struct cf_level *levels;
+ int *count;
+};
+
+# Individual frequency driver methods
+
+#
+# Set an individual driver's setting.
+#
+METHOD int drv_set {
+ device_t dev;
+ const struct cf_setting *set;
+};
+
+#
+# Get an individual driver's setting.
+#
+METHOD int drv_get {
+ device_t dev;
+ struct cf_setting *set;
+};
+
+#
+# Get the settings supported by a driver.
+#
+METHOD int drv_settings {
+ device_t dev;
+ struct cf_setting *sets;
+ int *count;
+};
+
+#
+# Get an individual driver's type.
+#
+METHOD int drv_type {
+ device_t dev;
+ int *type;
+};
+
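A sketch of the per-driver half of this interface (illustrative only, not part of the change above): a hardware back-end such as est(4) or powernow(4) reports the settings it can program and their type, and the cpufreq framework in kern_cpu.c combines them into the levels exposed to the rest of the system. The "mycpufreq" name and the frequency values are made up; struct cf_setting, CPUFREQ_VAL_UNKNOWN and CPUFREQ_TYPE_ABSOLUTE come from <sys/cpu.h>.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>

#include "cpufreq_if.h"

#define	MYCPUFREQ_NSETTINGS	3

static const int mycpufreq_mhz[MYCPUFREQ_NSETTINGS] = { 2400, 1800, 1200 };

/* CPUFREQ_DRV_SETTINGS(): report the settings this driver can program. */
static int
mycpufreq_settings(device_t dev, struct cf_setting *sets, int *count)
{
	int i;

	if (sets == NULL || count == NULL)
		return (EINVAL);
	if (*count < MYCPUFREQ_NSETTINGS)
		return (E2BIG);

	for (i = 0; i < MYCPUFREQ_NSETTINGS; i++) {
		/* Mark everything unknown, then fill in what we know. */
		memset(&sets[i], CPUFREQ_VAL_UNKNOWN, sizeof(sets[i]));
		sets[i].freq = mycpufreq_mhz[i];
		sets[i].dev = dev;
	}
	*count = MYCPUFREQ_NSETTINGS;
	return (0);
}

/* CPUFREQ_DRV_TYPE(): these are absolute frequencies, not relative steps. */
static int
mycpufreq_type(device_t dev, int *type)
{
	if (type == NULL)
		return (EINVAL);
	*type = CPUFREQ_TYPE_ABSOLUTE;
	return (0);
}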
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m
new file mode 100644
index 0000000..eb720eb
--- /dev/null
+++ b/sys/kern/device_if.m
@@ -0,0 +1,318 @@
+#-
+# Copyright (c) 1998-2004 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+/**
+ * @defgroup DEVICE device - KObj methods for all device drivers
+ * @brief A basic set of methods required for all device drivers.
+ *
+ * The device interface is used to match devices to drivers during
+ * autoconfiguration and provides methods to allow drivers to handle
+ * system-wide events such as suspend, resume or shutdown.
+ * @{
+ */
+INTERFACE device;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static int null_shutdown(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_suspend(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_resume(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_quiesce(device_t dev)
+ {
+ return EOPNOTSUPP;
+ }
+};
+
+/**
+ * @brief Probe to see if a device matches a driver.
+ *
+ * Users should not call this method directly. Normally, this
+ * is called via device_probe_and_attach() to select a driver by
+ * calling the DEVICE_PROBE() of all candidate drivers and attaching
+ * the winning driver (if any) to the device.
+ *
+ * This function is used to match devices to device drivers.
+ * Typically, the driver will examine the device to see if
+ * it is suitable for this driver. This might include checking
+ * the values of various device instance variables or reading
+ * hardware registers.
+ *
+ * In some cases, there may be more than one driver available
+ * which can be used for a device (for instance there might
+ * be a generic driver which works for a set of many types of
+ * device and a more specific driver which works for a subset
+ * of devices). Because of this, a driver should not assume
+ * that it will be the driver that attaches to the device even
+ * if it returns a success status from DEVICE_PROBE(). In particular,
+ * a driver must free any resources which it allocated during
+ * the probe before returning. The return value of DEVICE_PROBE()
+ * is used to elect which driver is used - the driver which returns
+ * the largest non-error value wins the election and attaches to
+ * the device. Common non-error values are described in the
+ * DEVICE_PROBE(9) manual page.
+ *
+ * If a driver matches the hardware, it should set the device
+ * description string using device_set_desc() or
+ * device_set_desc_copy(). This string is used to generate an
+ * informative message when DEVICE_ATTACH() is called.
+ *
+ * As a special case, if a driver returns zero, the driver election
+ * is cut short and that driver will attach to the device
+ * immediately. This should rarely be used.
+ *
+ * For example, a probe method for a PCI device driver might look
+ * like this:
+ *
+ * @code
+ * int
+ * foo_probe(device_t dev)
+ * {
+ * if (pci_get_vendor(dev) == FOOVENDOR &&
+ * pci_get_device(dev) == FOODEVICE) {
+ * device_set_desc(dev, "Foo device");
+ * return (BUS_PROBE_DEFAULT);
+ * }
+ * return (ENXIO);
+ * }
+ * @endcode
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_probe, foo_probe)
+ * @endcode
+ *
+ * @param dev the device to probe
+ *
+ * @retval 0 if this is the only possible driver for this
+ * device
+ * @retval negative if the driver can match this device - the
+ * least negative value is used to select the
+ * driver
+ * @retval ENXIO if the driver does not match the device
+ * @retval positive if some kind of error was detected during
+ * the probe, a regular unix error code should
+ * be returned to indicate the type of error
+ * @see DEVICE_ATTACH(), pci_get_vendor(), pci_get_device()
+ */
+METHOD int probe {
+ device_t dev;
+};
+
+/**
+ * @brief Allow a device driver to detect devices not otherwise enumerated.
+ *
+ * The DEVICE_IDENTIFY() method is used by some drivers (e.g. the ISA
+ * bus driver) to help populate the bus device with a useful set of
+ * child devices, normally by calling the BUS_ADD_CHILD() method of
+ * the parent device. For instance, the ISA bus driver uses several
+ * special drivers, including the isahint driver and the pnp driver to
+ * create child devices based on configuration hints and PnP bus
+ * probes respectively.
+ *
+ * Many bus drivers which support true plug-and-play do not need to
+ * use this method at all since child devices can be discovered
+ * automatically without help from child drivers.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_identify, foo_identify)
+ * @endcode
+ *
+ * @param driver the driver whose identify method is being called
+ * @param parent the parent device to use when adding new children
+ */
+STATICMETHOD void identify {
+ driver_t *driver;
+ device_t parent;
+};
+
+/**
+ * @brief Attach a device to a device driver
+ *
+ * Normally only called via device_probe_and_attach(), this is called
+ * when a driver has succeeded in probing against a device.
+ * This method should initialise the hardware and allocate other
+ * system resources (e.g. devfs entries) as required.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_attach, foo_attach)
+ * @endcode
+ *
+ * @param dev the device to probe
+ *
+ * @retval 0 success
+ * @retval non-zero if some kind of error was detected during
+ * the attach, a regular unix error code should
+ * be returned to indicate the type of error
+ * @see DEVICE_PROBE()
+ */
+METHOD int attach {
+ device_t dev;
+};
+
+/**
+ * @brief Detach a driver from a device.
+ *
+ * This can be called if the user is replacing the
+ * driver software or if a device is about to be physically removed
+ * from the system (e.g. for removable hardware such as USB or PCCARD).
+ *
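+ * For illustration, a detach method typically releases whatever was
+ * allocated during attach; the softc fields used below (irq_res,
+ * irq_rid and irq_cookie) are hypothetical:
+ *
+ * @code
+ * int
+ * foo_detach(device_t dev)
+ * {
+ * 	struct foo_softc *sc;
+ *
+ * 	sc = device_get_softc(dev);
+ * 	bus_teardown_intr(dev, sc->irq_res, sc->irq_cookie);
+ * 	bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res);
+ * 	return (0);
+ * }
+ * @endcode
+ *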
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_detach, foo_detach)
+ * @endcode
+ *
+ * @param dev the device to detach
+ *
+ * @retval 0 success
+ * @retval non-zero the detach could not be performed, e.g. if the
+ * driver does not support detaching.
+ *
+ * @see DEVICE_ATTACH()
+ */
+METHOD int detach {
+ device_t dev;
+};
+
+/**
+ * @brief Called during system shutdown.
+ *
+ * This method allows drivers to detect when the system is being shut down.
+ * Some drivers need to use this to place their hardware in a consistent
+ * state before rebooting the computer.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_shutdown, foo_shutdown)
+ * @endcode
+ *
+ * @param dev	the device being shut down
+ */
+METHOD int shutdown {
+ device_t dev;
+} DEFAULT null_shutdown;
+
+/**
+ * @brief This is called by the power-management subsystem when a
+ * suspend has been requested by the user or by some automatic
+ * mechanism.
+ *
+ * This gives drivers a chance to veto the suspend or save their
+ * configuration before power is removed.
+ *
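+ * For example, a driver might veto the suspend while its hardware is
+ * still busy; the foo_softc flag used below is hypothetical:
+ *
+ * @code
+ * int
+ * foo_suspend(device_t dev)
+ * {
+ * 	struct foo_softc *sc;
+ *
+ * 	sc = device_get_softc(dev);
+ * 	if (sc->transfer_active)
+ * 		return (EBUSY);
+ * 	return (0);
+ * }
+ * @endcode
+ *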
+ * To include this method in a device driver, use a line like this in
+ * the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_suspend, foo_suspend)
+ * @endcode
+ *
+ * @param dev the device being suspended
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to prepare the
+ * device for suspension
+ *
+ * @see DEVICE_RESUME()
+ */
+METHOD int suspend {
+ device_t dev;
+} DEFAULT null_suspend;
+
+/**
+ * @brief This is called when the system resumes after a suspend.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_resume, foo_resume)
+ * @endcode
+ *
+ * @param dev the device being resumed
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to restore the
+ * device from suspension
+ *
+ * @see DEVICE_SUSPEND()
+ */
+METHOD int resume {
+ device_t dev;
+} DEFAULT null_resume;
+
+/**
+ * @brief This is called when the driver is asked to quiesce itself.
+ *
+ * The driver should arrange for the orderly shutdown of this device
+ * and curtail any further access to it.  A detach request will
+ * usually follow, but it is not guaranteed.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_quiesce, foo_quiesce)
+ * @endcode
+ *
+ * @param dev the device being quiesced
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to quiesce the
+ * device
+ *
+ * @see DEVICE_DETACH()
+ */
+METHOD int quiesce {
+ device_t dev;
+} DEFAULT null_quiesce;
diff --git a/sys/kern/dtio_kdtrace.c b/sys/kern/dtio_kdtrace.c
new file mode 100644
index 0000000..3d6f416
--- /dev/null
+++ b/sys/kern/dtio_kdtrace.c
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2012 Advanced Computing Technologies LLC
+ * Written by George Neville-Neil gnn@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+#include "../sys/dtrace_bsd.h"
+
+
+static int dtio_unload(void);
+static void dtio_getargdesc(void *, dtrace_id_t, void *,
+ dtrace_argdesc_t *);
+static void dtio_provide(void *, dtrace_probedesc_t *);
+static void dtio_destroy(void *, dtrace_id_t, void *);
+static void dtio_enable(void *, dtrace_id_t, void *);
+static void dtio_disable(void *, dtrace_id_t, void *);
+static void dtio_load(void *);
+
+static dtrace_pattr_t dtio_attr = {
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+};
+
+static char *kernel = "kernel";
+
+/*
+ * Name strings.
+ */
+static char *dtio_start_str = "start";
+static char *dtio_done_str = "done";
+static char *dtio_wait_start_str = "wait-start";
+static char *dtio_wait_done_str = "wait-done";
+
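+/*
+ * Provider operations for the "io" provider.  Optional operations that
+ * this provider does not implement are left NULL.
+ */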
+static dtrace_pops_t dtio_pops = {
+ dtio_provide,
+ NULL,
+ dtio_enable,
+ dtio_disable,
+ NULL,
+ NULL,
+ dtio_getargdesc,
+ NULL,
+ NULL,
+ dtio_destroy
+};
+
+static dtrace_provider_id_t dtio_id;
+
+extern uint32_t dtio_start_id;
+extern uint32_t dtio_done_id;
+extern uint32_t dtio_wait_start_id;
+extern uint32_t dtio_wait_done_id;
+
+static void
+dtio_getargdesc(void *arg, dtrace_id_t id, void *parg,
+ dtrace_argdesc_t *desc)
+{
+ const char *p = NULL;
+
+ switch (desc->dtargd_ndx) {
+ case 0:
+ p = "struct bio *";
+ break;
+ case 1:
+ p = "struct devstat *";
+ break;
+ default:
+ desc->dtargd_ndx = DTRACE_ARGNONE;
+ }
+
+ if (p != NULL)
+ strlcpy(desc->dtargd_native, p, sizeof(desc->dtargd_native));
+}
+
+static void
+dtio_provide(void *arg, dtrace_probedesc_t *desc)
+{
+ if (desc != NULL)
+ return;
+
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_start_str) == 0) {
+ dtio_start_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_start_str, 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL, dtio_done_str) == 0) {
+ dtio_done_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_done_str, 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_wait_start_str) == 0) {
+ dtio_wait_start_id = dtrace_probe_create(dtio_id, kernel,
+ NULL,
+ dtio_wait_start_str,
+ 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_wait_done_str) == 0) {
+ dtio_wait_done_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_wait_done_str, 0, NULL);
+ }
+}
+
+static void
+dtio_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+static void
+dtio_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ if (id == dtio_start_id)
+ dtrace_io_start_probe =
+ (dtrace_io_start_probe_func_t)dtrace_probe;
+ else if (id == dtio_done_id)
+ dtrace_io_done_probe =
+ (dtrace_io_done_probe_func_t)dtrace_probe;
+ else if (id == dtio_wait_start_id)
+ dtrace_io_wait_start_probe =
+ (dtrace_io_wait_start_probe_func_t)dtrace_probe;
+ else if (id == dtio_wait_done_id)
+ dtrace_io_wait_done_probe =
+ (dtrace_io_wait_done_probe_func_t)dtrace_probe;
+ else
+ printf("dtrace io provider: unknown ID\n");
+}
+
+static void
+dtio_disable(void *arg, dtrace_id_t id, void *parg)
+{
+ if (id == dtio_start_id)
+ dtrace_io_start_probe = NULL;
+ else if (id == dtio_done_id)
+ dtrace_io_done_probe = NULL;
+ else if (id == dtio_wait_start_id)
+ dtrace_io_wait_start_probe = NULL;
+ else if (id == dtio_wait_done_id)
+ dtrace_io_wait_done_probe = NULL;
+ else
+ printf("dtrace io provider: unknown ID\n");
+}
+
+static void
+dtio_load(void *dummy)
+{
+ if (dtrace_register("io", &dtio_attr, DTRACE_PRIV_USER, NULL,
+ &dtio_pops, NULL, &dtio_id) != 0)
+ return;
+}
+
+
+static int
+dtio_unload(void)
+{
+ dtrace_io_start_probe = NULL;
+ dtrace_io_done_probe = NULL;
+ dtrace_io_wait_start_probe = NULL;
+ dtrace_io_wait_done_probe = NULL;
+
+ return (dtrace_unregister(dtio_id));
+}
+
+static int
+dtio_modevent(module_t mod __unused, int type, void *data __unused)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ break;
+
+ case MOD_UNLOAD:
+ break;
+
+ case MOD_SHUTDOWN:
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+SYSINIT(dtio_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+ dtio_load, NULL);
+SYSUNINIT(dtio_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+ dtio_unload, NULL);
+
+DEV_MODULE(dtio, dtio_modevent, NULL);
+MODULE_VERSION(dtio, 1);
+MODULE_DEPEND(dtio, dtrace, 1, 1, 1);
+MODULE_DEPEND(dtio, opensolaris, 1, 1, 1);
diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh
new file mode 100644
index 0000000..1cbc32b
--- /dev/null
+++ b/sys/kern/genassym.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+# $FreeBSD$
+
+usage()
+{
+ echo "usage: genassym [-o outfile] objfile"
+ exit 1
+}
+
+
+work()
+{
+ ${NM:='nm'} "$1" | ${AWK:='awk'} '
+ / C .*sign$/ {
+ sign = substr($1, length($1) - 3, 4)
+ sub("^0*", "", sign)
+ if (sign != "")
+ sign = "-"
+ }
+ / C .*w0$/ {
+ w0 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w1$/ {
+ w1 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w2$/ {
+ w2 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w3$/ {
+ w3 = substr($1, length($1) - 3, 4)
+ w = w3 w2 w1 w0
+ sub("^0*", "", w)
+ if (w == "")
+ w = "0"
+ sub("w3$", "", $3)
+ # This still has minor problems representing INT_MIN, etc.
+ # E.g.,
+ # with 32-bit 2''s complement ints, this prints -0x80000000,
+ # which has the wrong type (unsigned int).
+ printf("#define\t%s\t%s0x%s\n", $3, sign, w)
+ } '
+}
+
+
+#
+# MAIN PROGRAM
+#
+use_outfile="no"
+while getopts "o:" option
+do
+ case "$option" in
+ o) outfile="$OPTARG"
+ use_outfile="yes";;
+ *) usage;;
+ esac
+done
+shift $(($OPTIND - 1))
+case $# in
+1) ;;
+*) usage;;
+esac
+
+if [ "$use_outfile" = "yes" ]
+then
+ work $1 3>"$outfile" >&3 3>&-
+else
+ work $1
+fi
+
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
new file mode 100644
index 0000000..3ae78de
--- /dev/null
+++ b/sys/kern/imgact_aout.c
@@ -0,0 +1,343 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <machine/frame.h>
+#include <machine/md_var.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_param.h>
+
+#ifdef __amd64__
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/ia32/ia32_signal.h>
+#endif
+
+static int exec_aout_imgact(struct image_params *imgp);
+static int aout_fixup(register_t **stack_base, struct image_params *imgp);
+
+#if defined(__i386__)
+struct sysentvec aout_sysvec = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = aout_fixup,
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "FreeBSD a.out",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS,
+ .sv_usrstack = USRSTACK,
+ .sv_psstrings = PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_schedtail = NULL,
+};
+
+#elif defined(__amd64__)
+
+#define AOUT32_USRSTACK 0xbfc00000
+#define AOUT32_PS_STRINGS \
+ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
+#define AOUT32_MINUSER FREEBSD32_MINUSER
+
+extern const char *freebsd32_syscallnames[];
+extern u_long ia32_maxssiz;
+
+struct sysentvec aout_sysvec = {
+ .sv_size = FREEBSD32_SYS_MAXSYSCALL,
+ .sv_table = freebsd32_sysent,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = aout_fixup,
+ .sv_sendsig = ia32_sendsig,
+ .sv_sigcode = ia32_sigcode,
+ .sv_szsigcode = &sz_ia32_sigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "FreeBSD a.out",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_pagesize = IA32_PAGE_SIZE,
+ .sv_minuser = AOUT32_MINUSER,
+ .sv_maxuser = AOUT32_USRSTACK,
+ .sv_usrstack = AOUT32_USRSTACK,
+ .sv_psstrings = AOUT32_PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = freebsd32_copyout_strings,
+ .sv_setregs = ia32_setregs,
+ .sv_fixlimit = ia32_fixlimit,
+ .sv_maxssiz = &ia32_maxssiz,
+ .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
+ .sv_set_syscall_retval = ia32_set_syscall_retval,
+ .sv_fetch_syscall_args = ia32_fetch_syscall_args,
+ .sv_syscallnames = freebsd32_syscallnames,
+};
+#else
+#error "Port me"
+#endif
+
+static int
+aout_fixup(register_t **stack_base, struct image_params *imgp)
+{
+
+ *(char **)stack_base -= sizeof(uint32_t);
+ return (suword32(*stack_base, imgp->args->argc));
+}
+
+static int
+exec_aout_imgact(struct image_params *imgp)
+{
+ const struct exec *a_out = (const struct exec *) imgp->image_header;
+ struct vmspace *vmspace;
+ vm_map_t map;
+ vm_object_t object;
+ vm_offset_t text_end, data_end;
+ unsigned long virtual_offset;
+ unsigned long file_offset;
+ unsigned long bss_size;
+ int error;
+
+ /*
+ * Linux and *BSD binaries look very much alike,
+ * only the machine id is different:
+ * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
+ * NetBSD is in network byte order.. ugh.
+ */
+ if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
+ ((a_out->a_midmag >> 16) & 0xff) != 0 &&
+ ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
+ return -1;
+
+ /*
+ * Set file/virtual offset based on a.out variant.
+ * We do two cases: host byte order and network byte order
+ * (for NetBSD compatibility)
+ */
+ switch ((int)(a_out->a_midmag & 0xffff)) {
+ case ZMAGIC:
+ virtual_offset = 0;
+ if (a_out->a_text) {
+ file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ /* Pass PS_STRINGS for BSD/OS binaries only. */
+ if (N_GETMID(*a_out) == MID_ZERO)
+ imgp->ps_strings = aout_sysvec.sv_psstrings;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ return (-1);
+ }
+ }
+
+ bss_size = roundup(a_out->a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if (/* entry point must lie within the text region */
+ a_out->a_entry < virtual_offset ||
+ a_out->a_entry >= virtual_offset + a_out->a_text ||
+
+ /* text and data size must each be page rounded */
+ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK
+
+#ifdef __amd64__
+ ||
+ /* overflows */
+ virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
+#endif
+ )
+ return (-1);
+
+ /* text + data can't exceed file size */
+ if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
+ return (EFAULT);
+
+ /*
+ * text/data/bss must not exceed limits
+ */
+ PROC_LOCK(imgp->proc);
+ if (/* text can't exceed maximum text size */
+ a_out->a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
+ racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
+ PROC_UNLOCK(imgp->proc);
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(imgp->proc);
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ error = exec_new_vmspace(imgp, &aout_sysvec);
+
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+
+ /*
+ * The vm space can be changed by exec_new_vmspace
+ */
+ vmspace = imgp->proc->p_vmspace;
+
+ object = imgp->object;
+ map = &vmspace->vm_map;
+ vm_map_lock(map);
+ vm_object_reference(object);
+
+ text_end = virtual_offset + a_out->a_text;
+ error = vm_map_insert(map, object,
+ file_offset,
+ virtual_offset, text_end,
+ VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ vm_object_deallocate(object);
+ return (error);
+ }
+ data_end = text_end + a_out->a_data;
+ if (a_out->a_data) {
+ vm_object_reference(object);
+ error = vm_map_insert(map, object,
+ file_offset + a_out->a_text,
+ text_end, data_end,
+ VM_PROT_ALL, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ vm_object_deallocate(object);
+ return (error);
+ }
+ }
+
+ if (bss_size) {
+ error = vm_map_insert(map, NULL, 0,
+ data_end, data_end + bss_size,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_map_unlock(map);
+ return (error);
+ }
+ }
+ vm_map_unlock(map);
+
+ /* Fill in process VM information */
+ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (virtual_offset + a_out->a_text);
+
+ /* Fill in image_params */
+ imgp->interpreted = 0;
+ imgp->entry_addr = a_out->a_entry;
+
+ imgp->proc->p_sysent = &aout_sysvec;
+
+ return (0);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
+EXEC_SET(aout, aout_execsw);
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
new file mode 100644
index 0000000..61a2aef
--- /dev/null
+++ b/sys/kern/imgact_elf.c
@@ -0,0 +1,2135 @@
+/*-
+ * Copyright (c) 2000 David O'Brien
+ * Copyright (c) 1995-1996 Søren Schmidt
+ * Copyright (c) 1996 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_core.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sf_buf.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/eventhandler.h>
+#include <sys/user.h>
+
+#include <net/zlib.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+
+#include <machine/elf.h>
+#include <machine/md_var.h>
+
+#define ELF_NOTE_ROUNDSIZE 4
+#define OLD_EI_BRAND 8
+
+static int __elfN(check_header)(const Elf_Ehdr *hdr);
+static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
+ const char *interp, int interp_name_len, int32_t *osrel);
+static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
+ u_long *entry, size_t pagesize);
+static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
+ caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
+ size_t pagesize);
+static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
+static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note,
+ int32_t *osrel);
+static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
+static boolean_t __elfN(check_note)(struct image_params *imgp,
+ Elf_Brandnote *checknote, int32_t *osrel);
+static vm_prot_t __elfN(trans_prot)(Elf_Word);
+static Elf_Word __elfN(untrans_prot)(vm_prot_t);
+
+SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
+ "");
+
+#ifdef COMPRESS_USER_CORES
+static int compress_core(gzFile, char *, char *, unsigned int,
+ struct thread * td);
+#endif
+#define CORE_BUF_SIZE (16 * 1024)
+
+int __elfN(fallback_brand) = -1;
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+ fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
+TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
+ &__elfN(fallback_brand));
+
+static int elf_legacy_coredump = 0;
+SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
+ &elf_legacy_coredump, 0, "");
+
+int __elfN(nxstack) =
+#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */
+ 1;
+#else
+ 0;
+#endif
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+ nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");
+
+#if __ELF_WORD_SIZE == 32
+#if defined(__amd64__) || defined(__ia64__)
+int i386_read_exec = 0;
+SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
+ "enable execution from readable segments");
+#endif
+#endif
+
+static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
+
+#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
+#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
+#define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))
+
+static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
+
+Elf_Brandnote __elfN(freebsd_brandnote) = {
+ .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
+ .hdr.n_descsz = sizeof(int32_t),
+ .hdr.n_type = 1,
+ .vendor = FREEBSD_ABI_VENDOR,
+ .flags = BN_TRANSLATE_OSREL,
+ .trans_osrel = __elfN(freebsd_trans_osrel)
+};
+
+static boolean_t
+__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
+{
+ uintptr_t p;
+
+ p = (uintptr_t)(note + 1);
+ p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
+ *osrel = *(const int32_t *)(p);
+
+ return (TRUE);
+}
+
+static const char GNU_ABI_VENDOR[] = "GNU";
+static int GNU_KFREEBSD_ABI_DESC = 3;
+
+Elf_Brandnote __elfN(kfreebsd_brandnote) = {
+ .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
+ .hdr.n_descsz = 16, /* XXX at least 16 */
+ .hdr.n_type = 1,
+ .vendor = GNU_ABI_VENDOR,
+ .flags = BN_TRANSLATE_OSREL,
+ .trans_osrel = kfreebsd_trans_osrel
+};
+
+static boolean_t
+kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
+{
+ const Elf32_Word *desc;
+ uintptr_t p;
+
+ p = (uintptr_t)(note + 1);
+ p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
+
+ desc = (const Elf32_Word *)p;
+ if (desc[0] != GNU_KFREEBSD_ABI_DESC)
+ return (FALSE);
+
+ /*
+	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
+ * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
+ */
+ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
+
+ return (TRUE);
+}
+
+int
+__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == NULL) {
+ elf_brand_list[i] = entry;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS) {
+ printf("WARNING: %s: could not insert brandinfo entry: %p\n",
+ __func__, entry);
+ return (-1);
+ }
+ return (0);
+}
+
+int
+__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == entry) {
+ elf_brand_list[i] = NULL;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return (-1);
+ return (0);
+}
+
+int
+__elfN(brand_inuse)(Elf_Brandinfo *entry)
+{
+ struct proc *p;
+ int rval = FALSE;
+
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == entry->sysvec) {
+ rval = TRUE;
+ break;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+
+ return (rval);
+}
+
+static Elf_Brandinfo *
+__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
+ int interp_name_len, int32_t *osrel)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
+ Elf_Brandinfo *bi;
+ boolean_t ret;
+ int i;
+
+ /*
+ * We support four types of branding -- (1) the ELF EI_OSABI field
+ * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
+	 * branding within the ELF header, (3) the ELF interpreter path,
+	 * matched against each brand's `interp_path', and (4) the
+	 * ".note.ABI-tag" ELF section.
+ */
+
+ /* Look for an ".note.ABI-tag" ELF section */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL)
+ continue;
+ if (hdr->e_machine == bi->machine && (bi->flags &
+ (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
+ ret = __elfN(check_note)(imgp, bi->brand_note, osrel);
+ if (ret)
+ return (bi);
+ }
+ }
+
+ /* If the executable has a brand, search for it in the brand list. */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ (hdr->e_ident[EI_OSABI] == bi->brand ||
+ strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
+ bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
+ return (bi);
+ }
+
+ /* Lacking a known brand, search for a recognized interpreter. */
+ if (interp != NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ /* ELF image p_filesz includes terminating zero */
+ strlen(bi->interp_path) + 1 == interp_name_len &&
+ strncmp(interp, bi->interp_path, interp_name_len)
+ == 0)
+ return (bi);
+ }
+ }
+
+ /* Lacking a recognized interpreter, try the default brand */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ __elfN(fallback_brand) == bi->brand)
+ return (bi);
+ }
+ return (NULL);
+}
+
+static int
+__elfN(check_header)(const Elf_Ehdr *hdr)
+{
+ Elf_Brandinfo *bi;
+ int i;
+
+ if (!IS_ELF(*hdr) ||
+ hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_phentsize != sizeof(Elf_Phdr) ||
+ hdr->e_version != ELF_TARG_VER)
+ return (ENOEXEC);
+
+ /*
+ * Make sure we have at least one brand for this machine.
+ */
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi != NULL && bi->machine == hdr->e_machine)
+ break;
+ }
+ if (i == MAX_BRANDS)
+ return (ENOEXEC);
+
+ return (0);
+}
+
+static int
+__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t start, vm_offset_t end, vm_prot_t prot)
+{
+ struct sf_buf *sf;
+ int error;
+ vm_offset_t off;
+
+ /*
+ * Create the page if it doesn't exist yet. Ignore errors.
+ */
+ vm_map_lock(map);
+ vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ vm_map_unlock(map);
+
+ /*
+ * Find the page from the underlying object.
+ */
+ if (object) {
+ sf = vm_imgact_map_page(object, offset);
+ if (sf == NULL)
+ return (KERN_FAILURE);
+ off = offset - trunc_page(offset);
+ error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
+ end - start);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (KERN_FAILURE);
+ }
+ }
+
+ return (KERN_SUCCESS);
+}
+
+static int
+__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
+{
+ struct sf_buf *sf;
+ vm_offset_t off;
+ vm_size_t sz;
+ int error, rv;
+
+ if (start != trunc_page(start)) {
+ rv = __elfN(map_partial)(map, object, offset, start,
+ round_page(start), prot);
+ if (rv)
+ return (rv);
+ offset += round_page(start) - start;
+ start = round_page(start);
+ }
+ if (end != round_page(end)) {
+ rv = __elfN(map_partial)(map, object, offset +
+ trunc_page(end) - start, trunc_page(end), end, prot);
+ if (rv)
+ return (rv);
+ end = trunc_page(end);
+ }
+ if (end > start) {
+ if (offset & PAGE_MASK) {
+ /*
+ * The mapping is not page aligned. This means we have
+ * to copy the data. Sigh.
+ */
+ rv = vm_map_find(map, NULL, 0, &start, end - start,
+ FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
+ if (rv)
+ return (rv);
+ if (object == NULL)
+ return (KERN_SUCCESS);
+ for (; start < end; start += sz) {
+ sf = vm_imgact_map_page(object, offset);
+ if (sf == NULL)
+ return (KERN_FAILURE);
+ off = offset - trunc_page(offset);
+ sz = end - start;
+ if (sz > PAGE_SIZE - off)
+ sz = PAGE_SIZE - off;
+ error = copyout((caddr_t)sf_buf_kva(sf) + off,
+ (caddr_t)start, sz);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (KERN_FAILURE);
+ }
+ offset += sz;
+ }
+ rv = KERN_SUCCESS;
+ } else {
+ vm_object_reference(object);
+ vm_map_lock(map);
+ rv = vm_map_insert(map, object, offset, start, end,
+ prot, VM_PROT_ALL, cow);
+ vm_map_unlock(map);
+ if (rv != KERN_SUCCESS)
+ vm_object_deallocate(object);
+ }
+ return (rv);
+ } else {
+ return (KERN_SUCCESS);
+ }
+}
+
+static int
+__elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
+ caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
+ size_t pagesize)
+{
+ struct sf_buf *sf;
+ size_t map_len;
+ vm_map_t map;
+ vm_object_t object;
+ vm_offset_t map_addr;
+ int error, rv, cow;
+ size_t copy_len;
+ vm_offset_t file_addr;
+
+ /*
+ * It's necessary to fail if the filsz + offset taken from the
+ * header is greater than the actual file pager object's size.
+ * If we were to allow this, then the vm_map_find() below would
+ * walk right off the end of the file object and into the ether.
+ *
+ * While I'm here, might as well check for something else that
+ * is invalid: filsz cannot be greater than memsz.
+ */
+ if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) {
+ uprintf("elf_load_section: truncated ELF file\n");
+ return (ENOEXEC);
+ }
+
+ object = imgp->object;
+ map = &imgp->proc->p_vmspace->vm_map;
+ map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
+ file_addr = trunc_page_ps(offset, pagesize);
+
+ /*
+ * We have two choices. We can either clear the data in the last page
+ * of an oversized mapping, or we can start the anon mapping a page
+ * early and copy the initialized data into that first page. We
+ * choose the second..
+ */
+ if (memsz > filsz)
+ map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
+ else
+ map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
+
+ if (map_len != 0) {
+ /* cow flags: don't dump readonly sections in core */
+ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
+ (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
+
+ rv = __elfN(map_insert)(map,
+ object,
+ file_addr, /* file offset */
+ map_addr, /* virtual start */
+ map_addr + map_len,/* virtual end */
+ prot,
+ cow);
+ if (rv != KERN_SUCCESS)
+ return (EINVAL);
+
+ /* we can stop now if we've covered it all */
+ if (memsz == filsz) {
+ return (0);
+ }
+ }
+
+
+ /*
+ * We have to get the remaining bit of the file into the first part
+ * of the oversized map segment. This is normally because the .data
+ * segment in the file is extended to provide bss. It's a neat idea
+ * to try and save a page, but it's a pain in the behind to implement.
+ */
+ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
+ map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
+ map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
+ map_addr;
+
+ /* This had damn well better be true! */
+ if (map_len != 0) {
+ rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr +
+ map_len, VM_PROT_ALL, 0);
+ if (rv != KERN_SUCCESS) {
+ return (EINVAL);
+ }
+ }
+
+ if (copy_len != 0) {
+ vm_offset_t off;
+
+ sf = vm_imgact_map_page(object, offset + filsz);
+ if (sf == NULL)
+ return (EIO);
+
+ /* send the page fragment to user space */
+ off = trunc_page_ps(offset + filsz, pagesize) -
+ trunc_page(offset + filsz);
+ error = copyout((caddr_t)sf_buf_kva(sf) + off,
+ (caddr_t)map_addr, copy_len);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (error);
+ }
+ }
+
+ /*
+ * set it to the specified protection.
+ * XXX had better undo the damage from pasting over the cracks here!
+ */
+ vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
+ map_len), prot, FALSE);
+
+ return (0);
+}
+
+/*
+ * Load the file "file" into memory. It may be either a shared object
+ * or an executable.
+ *
+ * The "addr" reference parameter is in/out. On entry, it specifies
+ * the address where a shared object should be loaded. If the file is
+ * an executable, this value is ignored. On exit, "addr" specifies
+ * where the file was actually loaded.
+ *
+ * The "entry" reference parameter is out only. On exit, it specifies
+ * the entry point for the loaded file.
+ */
+static int
+__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
+ u_long *entry, size_t pagesize)
+{
+ struct {
+ struct nameidata nd;
+ struct vattr attr;
+ struct image_params image_params;
+ } *tempdata;
+ const Elf_Ehdr *hdr = NULL;
+ const Elf_Phdr *phdr = NULL;
+ struct nameidata *nd;
+ struct vattr *attr;
+ struct image_params *imgp;
+ vm_prot_t prot;
+ u_long rbase;
+ u_long base_addr = 0;
+ int error, i, numsegs;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * XXXJA: This check can go away once we are sufficiently confident
+ * that the checks in namei() are correct.
+ */
+ if (IN_CAPABILITY_MODE(curthread))
+ return (ECAPMODE);
+#endif
+
+ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
+ nd = &tempdata->nd;
+ attr = &tempdata->attr;
+ imgp = &tempdata->image_params;
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->attr = attr;
+ imgp->firstpage = NULL;
+ imgp->image_header = NULL;
+ imgp->object = NULL;
+ imgp->execlabel = NULL;
+
+ NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
+ if ((error = namei(nd)) != 0) {
+ nd->ni_vp = NULL;
+ goto fail;
+ }
+ NDFREE(nd, NDF_ONLY_PNBUF);
+ imgp->vp = nd->ni_vp;
+
+ /*
+ * Check permissions, modes, uid, etc on the file, and "open" it.
+ */
+ error = exec_check_permissions(imgp);
+ if (error)
+ goto fail;
+
+ error = exec_map_first_page(imgp);
+ if (error)
+ goto fail;
+
+ /*
+ * Also make certain that the interpreter stays the same, so set
+ * its VV_TEXT flag, too.
+ */
+ VOP_SET_TEXT(nd->ni_vp);
+
+ imgp->object = nd->ni_vp->v_object;
+
+ hdr = (const Elf_Ehdr *)imgp->image_header;
+ if ((error = __elfN(check_header)(hdr)) != 0)
+ goto fail;
+ if (hdr->e_type == ET_DYN)
+ rbase = *addr;
+ else if (hdr->e_type == ET_EXEC)
+ rbase = 0;
+ else {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ /* Only support headers that fit within first page for now */
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr)) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
+ /* Loadable segment */
+ prot = __elfN(trans_prot)(phdr[i].p_flags);
+ error = __elfN(load_section)(imgp, phdr[i].p_offset,
+ (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
+ phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize);
+ if (error != 0)
+ goto fail;
+ /*
+ * Establish the base address if this is the
+ * first segment.
+ */
+ if (numsegs == 0)
+ base_addr = trunc_page(phdr[i].p_vaddr +
+ rbase);
+ numsegs++;
+ }
+ }
+ *addr = base_addr;
+ *entry = (unsigned long)hdr->e_entry + rbase;
+
+fail:
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+
+ if (nd->ni_vp)
+ vput(nd->ni_vp);
+
+ free(tempdata, M_TEMP);
+
+ return (error);
+}
+
+static int
+__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
+ const Elf_Phdr *phdr;
+ Elf_Auxargs *elf_auxargs;
+ struct vmspace *vmspace;
+ vm_prot_t prot;
+ u_long text_size = 0, data_size = 0, total_size = 0;
+ u_long text_addr = 0, data_addr = 0;
+ u_long seg_size, seg_addr;
+ u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0;
+ int32_t osrel = 0;
+ int error = 0, i, n, interp_name_len = 0;
+ const char *interp = NULL, *newinterp = NULL;
+ Elf_Brandinfo *brand_info;
+ char *path;
+ struct sysentvec *sv;
+
+ /*
+ * Do we have a valid ELF header ?
+ *
+ * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
+ * if particular brand doesn't support it.
+ */
+ if (__elfN(check_header)(hdr) != 0 ||
+ (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
+ return (-1);
+
+ /*
+ * From here on down, we return an errno, not -1, as we've
+ * detected an ELF file.
+ */
+
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
+ /* Only support headers in first page for now */
+ return (ENOEXEC);
+ }
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr))
+ return (ENOEXEC);
+ n = 0;
+ baddr = 0;
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch (phdr[i].p_type) {
+ case PT_LOAD:
+ if (n == 0)
+ baddr = phdr[i].p_vaddr;
+ n++;
+ break;
+ case PT_INTERP:
+ /* Path to interpreter */
+ if (phdr[i].p_filesz > MAXPATHLEN ||
+ phdr[i].p_offset > PAGE_SIZE ||
+ phdr[i].p_filesz > PAGE_SIZE - phdr[i].p_offset)
+ return (ENOEXEC);
+ interp = imgp->image_header + phdr[i].p_offset;
+ interp_name_len = phdr[i].p_filesz;
+ break;
+ case PT_GNU_STACK:
+ if (__elfN(nxstack))
+ imgp->stack_prot =
+ __elfN(trans_prot)(phdr[i].p_flags);
+ break;
+ }
+ }
+
+ brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
+ &osrel);
+ if (brand_info == NULL) {
+ uprintf("ELF binary type \"%u\" not known.\n",
+ hdr->e_ident[EI_OSABI]);
+ return (ENOEXEC);
+ }
+ if (hdr->e_type == ET_DYN) {
+ if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0)
+ return (ENOEXEC);
+ /*
+ * Honour the base load address from the dso if it is
+ * non-zero for some reason.
+ */
+ if (baddr == 0)
+ et_dyn_addr = ET_DYN_LOAD_ADDR;
+ else
+ et_dyn_addr = 0;
+ } else
+ et_dyn_addr = 0;
+ sv = brand_info->sysvec;
+ if (interp != NULL && brand_info->interp_newpath != NULL)
+ newinterp = brand_info->interp_newpath;
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ *
+ * The VV_TEXT flag prevents modifications to the executable while
+ * the vnode is unlocked.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ error = exec_new_vmspace(imgp, sv);
+ imgp->proc->p_sysent = sv;
+
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch (phdr[i].p_type) {
+ case PT_LOAD: /* Loadable segment */
+ if (phdr[i].p_memsz == 0)
+ break;
+ prot = __elfN(trans_prot)(phdr[i].p_flags);
+ error = __elfN(load_section)(imgp, phdr[i].p_offset,
+ (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
+ phdr[i].p_memsz, phdr[i].p_filesz, prot,
+ sv->sv_pagesize);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If this segment contains the program headers,
+ * remember their virtual address for the AT_PHDR
+ * aux entry. Static binaries don't usually include
+ * a PT_PHDR entry.
+ */
+ if (phdr[i].p_offset == 0 &&
+ hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
+ <= phdr[i].p_filesz)
+ proghdr = phdr[i].p_vaddr + hdr->e_phoff +
+ et_dyn_addr;
+
+ seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
+ seg_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr + et_dyn_addr - seg_addr);
+
+ /*
+ * Make the largest executable segment the official
+ * text segment and all others data.
+ *
+ * Note that obreak() assumes that data_addr +
+ * data_size == end of data load area, and the ELF
+ * file format expects segments to be sorted by
+ * address. If multiple data segments exist, the
+ * last one will be used.
+ */
+
+ if (phdr[i].p_flags & PF_X && text_size < seg_size) {
+ text_size = seg_size;
+ text_addr = seg_addr;
+ } else {
+ data_size = seg_size;
+ data_addr = seg_addr;
+ }
+ total_size += seg_size;
+ break;
+ case PT_PHDR: /* Program header table info */
+ proghdr = phdr[i].p_vaddr + et_dyn_addr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (data_addr == 0 && data_size == 0) {
+ data_addr = text_addr;
+ data_size = text_size;
+ }
+
+ entry = (u_long)hdr->e_entry + et_dyn_addr;
+
+ /*
+ * Check limits. It should be safe to check the
+ * limits after loading the segments since we do
+ * not actually fault in all the segments pages.
+ */
+ PROC_LOCK(imgp->proc);
+ if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
+ text_size > maxtsiz ||
+ total_size > lim_cur(imgp->proc, RLIMIT_VMEM) ||
+ racct_set(imgp->proc, RACCT_DATA, data_size) != 0 ||
+ racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) {
+ PROC_UNLOCK(imgp->proc);
+ return (ENOMEM);
+ }
+
+ vmspace = imgp->proc->p_vmspace;
+ vmspace->vm_tsize = text_size >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
+ vmspace->vm_dsize = data_size >> PAGE_SHIFT;
+ vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
+
+ /*
+ * We load the dynamic linker where a userland call
+ * to mmap(0, ...) would put it. The rationale behind this
+ * calculation is that it leaves room for the heap to grow to
+ * its maximum allowed size.
+ */
+ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(imgp->proc,
+ RLIMIT_DATA));
+ PROC_UNLOCK(imgp->proc);
+
+ imgp->entry_addr = entry;
+
+ if (interp != NULL) {
+ int have_interp = FALSE;
+ VOP_UNLOCK(imgp->vp, 0);
+ if (brand_info->emul_path != NULL &&
+ brand_info->emul_path[0] != '\0') {
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ snprintf(path, MAXPATHLEN, "%s%s",
+ brand_info->emul_path, interp);
+ error = __elfN(load_file)(imgp->proc, path, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ free(path, M_TEMP);
+ if (error == 0)
+ have_interp = TRUE;
+ }
+ if (!have_interp && newinterp != NULL) {
+ error = __elfN(load_file)(imgp->proc, newinterp, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ if (error == 0)
+ have_interp = TRUE;
+ }
+ if (!have_interp) {
+ error = __elfN(load_file)(imgp->proc, interp, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ }
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error != 0) {
+ uprintf("ELF interpreter %s not found\n", interp);
+ return (error);
+ }
+ } else
+ addr = et_dyn_addr;
+
+ /*
+ * Construct auxargs table (used by the fixup routine)
+ */
+ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
+ elf_auxargs->execfd = -1;
+ elf_auxargs->phdr = proghdr;
+ elf_auxargs->phent = hdr->e_phentsize;
+ elf_auxargs->phnum = hdr->e_phnum;
+ elf_auxargs->pagesz = PAGE_SIZE;
+ elf_auxargs->base = addr;
+ elf_auxargs->flags = 0;
+ elf_auxargs->entry = entry;
+
+ imgp->auxargs = elf_auxargs;
+ imgp->interpreted = 0;
+ imgp->reloc_base = addr;
+ imgp->proc->p_osrel = osrel;
+
+ return (error);
+}
+
+#define suword __CONCAT(suword, __ELF_WORD_SIZE)
+
+int
+__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
+{
+ Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
+ Elf_Addr *base;
+ Elf_Addr *pos;
+
+ base = (Elf_Addr *)*stack_base;
+ pos = base + (imgp->args->argc + imgp->args->envc + 2);
+
+ if (args->execfd != -1)
+ AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
+ AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY(pos, AT_BASE, args->base);
+ if (imgp->execpathp != 0)
+ AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
+ AUXARGS_ENTRY(pos, AT_OSRELDATE, osreldate);
+ if (imgp->canary != 0) {
+ AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
+ AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
+ }
+ AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
+ if (imgp->pagesizes != 0) {
+ AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
+ AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
+ }
+ if (imgp->sysent->sv_timekeep_base != 0) {
+ AUXARGS_ENTRY(pos, AT_TIMEKEEP,
+ imgp->sysent->sv_timekeep_base);
+ }
+ AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
+ != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
+ imgp->sysent->sv_stackprot);
+ AUXARGS_ENTRY(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ base--;
+ suword(base, (long)imgp->args->argc);
+ *stack_base = (register_t *)base;
+ return (0);
+}
+
+/*
+ * Code for generating ELF core dumps.
+ */
+
+typedef void (*segment_callback)(vm_map_entry_t, void *);
+
+/* Closure for cb_put_phdr(). */
+struct phdr_closure {
+ Elf_Phdr *phdr; /* Program header to fill in */
+ Elf_Off offset; /* Offset of segment in core file */
+};
+
+/* Closure for cb_size_segment(). */
+struct sseg_closure {
+ int count; /* Count of writable segments. */
+ size_t size; /* Total size of all writable segments. */
+};
+
+typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);
+
+struct note_info {
+ int type; /* Note type. */
+ outfunc_t outfunc; /* Output function. */
+ void *outarg; /* Argument for the output function. */
+ size_t outsize; /* Output size. */
+ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */
+};
+
+TAILQ_HEAD(note_info_list, note_info);
+
+static void cb_put_phdr(vm_map_entry_t, void *);
+static void cb_size_segment(vm_map_entry_t, void *);
+static void each_writable_segment(struct thread *, segment_callback, void *);
+static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
+ int, void *, size_t, struct note_info_list *, size_t, gzFile);
+static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
+ size_t *);
+static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
+static void __elfN(putnote)(struct note_info *, struct sbuf *);
+static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
+static int sbuf_drain_core_output(void *, const char *, int);
+static int sbuf_drain_count(void *arg, const char *data, int len);
+
+static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
+static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
+static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
+static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
+static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
+static void note_procstat_files(void *, struct sbuf *, size_t *);
+static void note_procstat_groups(void *, struct sbuf *, size_t *);
+static void note_procstat_osrel(void *, struct sbuf *, size_t *);
+static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
+static void note_procstat_umask(void *, struct sbuf *, size_t *);
+static void note_procstat_vmmap(void *, struct sbuf *, size_t *);
+
+#ifdef COMPRESS_USER_CORES
+extern int compress_user_cores;
+extern int compress_user_cores_gzlevel;
+#endif
+
+static int
+core_output(struct vnode *vp, void *base, size_t len, off_t offset,
+ struct ucred *active_cred, struct ucred *file_cred,
+    struct thread *td, char *core_buf, gzFile gzfile)
+{
+	int error;
+
+ if (gzfile) {
+#ifdef COMPRESS_USER_CORES
+ error = compress_core(gzfile, base, core_buf, len, td);
+#else
+ panic("shouldn't be here");
+#endif
+ } else {
+ error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset,
+ UIO_USERSPACE, IO_UNIT | IO_DIRECT, active_cred, file_cred,
+ NULL, td);
+ }
+ return (error);
+}
+
+/* Coredump output parameters for sbuf drain routine. */
+struct sbuf_drain_core_params {
+ off_t offset;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ struct thread *td;
+ struct vnode *vp;
+#ifdef COMPRESS_USER_CORES
+ gzFile gzfile;
+#endif
+};
+
+/*
+ * Drain into a core file.
+ */
+static int
+sbuf_drain_core_output(void *arg, const char *data, int len)
+{
+ struct sbuf_drain_core_params *p;
+ int error, locked;
+
+ p = (struct sbuf_drain_core_params *)arg;
+
+ /*
+ * Some kern_proc out routines that print to this sbuf may
+ * call us with the process lock held. Draining with the
+ * non-sleepable lock held is unsafe. The lock is needed for
+ * those routines when dumping a live process. In our case we
+ * can safely release the lock before draining and acquire
+ * again after.
+ */
+ locked = PROC_LOCKED(p->td->td_proc);
+ if (locked)
+ PROC_UNLOCK(p->td->td_proc);
+#ifdef COMPRESS_USER_CORES
+ if (p->gzfile != Z_NULL)
+ error = compress_core(p->gzfile, NULL, __DECONST(char *, data),
+ len, p->td);
+ else
+#endif
+ error = vn_rdwr_inchunks(UIO_WRITE, p->vp,
+ __DECONST(void *, data), len, p->offset, UIO_SYSSPACE,
+ IO_UNIT | IO_DIRECT, p->active_cred, p->file_cred, NULL,
+ p->td);
+ if (locked)
+ PROC_LOCK(p->td->td_proc);
+ if (error != 0)
+ return (-error);
+ p->offset += len;
+ return (len);
+}
+
+/*
+ * Drain into a counter.
+ */
+static int
+sbuf_drain_count(void *arg, const char *data __unused, int len)
+{
+ size_t *sizep;
+
+ sizep = (size_t *)arg;
+ *sizep += len;
+ return (len);
+}
+
+int
+__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
+{
+ struct ucred *cred = td->td_ucred;
+ int error = 0;
+ struct sseg_closure seginfo;
+ struct note_info_list notelst;
+ struct note_info *ninfo;
+ void *hdr;
+ size_t hdrsize, notesz, coresize;
+
+ gzFile gzfile = Z_NULL;
+ char *core_buf = NULL;
+#ifdef COMPRESS_USER_CORES
+ char gzopen_flags[8];
+ char *p;
+ int doing_compress = flags & IMGACT_CORE_COMPRESS;
+#endif
+
+ hdr = NULL;
+ TAILQ_INIT(&notelst);
+
+#ifdef COMPRESS_USER_CORES
+ if (doing_compress) {
+ p = gzopen_flags;
+ *p++ = 'w';
+ if (compress_user_cores_gzlevel >= 0 &&
+ compress_user_cores_gzlevel <= 9)
+ *p++ = '0' + compress_user_cores_gzlevel;
+ *p = 0;
+ gzfile = gz_open("", gzopen_flags, vp);
+ if (gzfile == Z_NULL) {
+ error = EFAULT;
+ goto done;
+ }
+ core_buf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
+ if (!core_buf) {
+ error = ENOMEM;
+ goto done;
+ }
+ }
+#endif
+
+ /* Size the program segments. */
+ seginfo.count = 0;
+ seginfo.size = 0;
+ each_writable_segment(td, cb_size_segment, &seginfo);
+
+ /*
+ * Collect info about the core file header area.
+ */
+ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
+ __elfN(prepare_notes)(td, &notelst, &notesz);
+ coresize = round_page(hdrsize + notesz) + seginfo.size;
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_CORE, coresize);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = EFAULT;
+ goto done;
+ }
+#endif
+ if (coresize >= limit) {
+ error = EFAULT;
+ goto done;
+ }
+
+ /*
+ * Allocate memory for building the header, fill it up,
+ * and write it out following the notes.
+ */
+ hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
+ if (hdr == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize,
+ &notelst, notesz, gzfile);
+
+ /* Write the contents of all of the writable segments. */
+ if (error == 0) {
+ Elf_Phdr *php;
+ off_t offset;
+ int i;
+
+ php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
+ offset = round_page(hdrsize + notesz);
+ for (i = 0; i < seginfo.count; i++) {
+ error = core_output(vp, (caddr_t)(uintptr_t)php->p_vaddr,
+ php->p_filesz, offset, cred, NOCRED, curthread, core_buf, gzfile);
+ if (error != 0)
+ break;
+ offset += php->p_filesz;
+ php++;
+ }
+ }
+ if (error) {
+ log(LOG_WARNING,
+ "Failed to write core file for process %s (error %d)\n",
+ curproc->p_comm, error);
+ }
+
+done:
+#ifdef COMPRESS_USER_CORES
+ if (core_buf)
+ free(core_buf, M_TEMP);
+ if (gzfile)
+ gzclose(gzfile);
+#endif
+ while ((ninfo = TAILQ_FIRST(&notelst)) != NULL) {
+ TAILQ_REMOVE(&notelst, ninfo, link);
+ free(ninfo, M_TEMP);
+ }
+ if (hdr != NULL)
+ free(hdr, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * A callback for each_writable_segment() to write out the segment's
+ * program header entry.
+ */
+static void
+cb_put_phdr(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct phdr_closure *phc = (struct phdr_closure *)closure;
+ Elf_Phdr *phdr = phc->phdr;
+
+ phc->offset = round_page(phc->offset);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_offset = phc->offset;
+ phdr->p_vaddr = entry->start;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
+ phdr->p_align = PAGE_SIZE;
+ phdr->p_flags = __elfN(untrans_prot)(entry->protection);
+
+ phc->offset += phdr->p_filesz;
+ phc->phdr++;
+}
+
+/*
+ * A callback for each_writable_segment() to gather information about
+ * the number of segments and their total size.
+ */
+static void
+cb_size_segment(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct sseg_closure *ssc = (struct sseg_closure *)closure;
+
+ ssc->count++;
+ ssc->size += entry->end - entry->start;
+}
+
+/*
+ * For each writable segment in the process's memory map, call the given
+ * function with a pointer to the map entry and some arbitrary
+ * caller-supplied data.
+ */
+static void
+each_writable_segment(td, func, closure)
+ struct thread *td;
+ segment_callback func;
+ void *closure;
+{
+ struct proc *p = td->td_proc;
+ vm_map_t map = &p->p_vmspace->vm_map;
+ vm_map_entry_t entry;
+ vm_object_t backing_object, object;
+ boolean_t ignore_entry;
+
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ /*
+		 * Don't dump inaccessible mappings; handle legacy
+		 * coredump mode.
+		 *
+		 * Note that read-only segments related to the ELF binary
+		 * are now marked MAP_ENTRY_NOCOREDUMP, so we no longer
+		 * need to arbitrarily ignore such segments.
+ */
+ if (elf_legacy_coredump) {
+ if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
+ continue;
+ } else {
+ if ((entry->protection & VM_PROT_ALL) == 0)
+ continue;
+ }
+
+ /*
+		 * Don't include a memory segment in the core dump if
+		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
+		 * madvise(2). Do not dump submaps (i.e., parts of the
+		 * kernel map).
+ */
+ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
+ continue;
+
+ if ((object = entry->object.vm_object) == NULL)
+ continue;
+
+ /* Ignore memory-mapped devices and such things. */
+ VM_OBJECT_RLOCK(object);
+ while ((backing_object = object->backing_object) != NULL) {
+ VM_OBJECT_RLOCK(backing_object);
+ VM_OBJECT_RUNLOCK(object);
+ object = backing_object;
+ }
+ ignore_entry = object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP && object->type != OBJT_VNODE;
+ VM_OBJECT_RUNLOCK(object);
+ if (ignore_entry)
+ continue;
+
+ (*func)(entry, closure);
+ }
+ vm_map_unlock_read(map);
+}
+
+/*
+ * Write the core file header to the file, including padding up to
+ * the page boundary.
+ */
+static int
+__elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred,
+ int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst,
+ size_t notesz, gzFile gzfile)
+{
+ struct sbuf_drain_core_params params;
+ struct note_info *ninfo;
+ struct sbuf *sb;
+ int error;
+
+ /* Fill in the header. */
+ bzero(hdr, hdrsize);
+ __elfN(puthdr)(td, hdr, hdrsize, numsegs, notesz);
+
+ params.offset = 0;
+ params.active_cred = cred;
+ params.file_cred = NOCRED;
+ params.td = td;
+ params.vp = vp;
+#ifdef COMPRESS_USER_CORES
+ params.gzfile = gzfile;
+#endif
+ sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_core_output, &params);
+ sbuf_start_section(sb, NULL);
+ sbuf_bcat(sb, hdr, hdrsize);
+ TAILQ_FOREACH(ninfo, notelst, link)
+ __elfN(putnote)(ninfo, sb);
+ /* Align up to a page boundary for the program segments. */
+ sbuf_end_section(sb, -1, PAGE_SIZE, 0);
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static void
+__elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
+ size_t *sizep)
+{
+ struct proc *p;
+ struct thread *thr;
+ size_t size;
+
+ p = td->td_proc;
+ size = 0;
+
+ size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);
+
+ /*
+ * To have the debugger select the right thread (LWP) as the initial
+ * thread, we dump the state of the thread passed to us in td first.
+	 * This is the thread that caused the core dump, and it is thus
+	 * likely to be the thread one wants selected in the debugger.
+ */
+ thr = td;
+ while (thr != NULL) {
+ size += register_note(list, NT_PRSTATUS,
+ __elfN(note_prstatus), thr);
+ size += register_note(list, NT_FPREGSET,
+ __elfN(note_fpregset), thr);
+ size += register_note(list, NT_THRMISC,
+ __elfN(note_thrmisc), thr);
+ size += register_note(list, -1,
+ __elfN(note_threadmd), thr);
+
+ thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
+ TAILQ_NEXT(thr, td_plist);
+ if (thr == td)
+ thr = TAILQ_NEXT(thr, td_plist);
+ }
+
+ size += register_note(list, NT_PROCSTAT_PROC,
+ __elfN(note_procstat_proc), p);
+ size += register_note(list, NT_PROCSTAT_FILES,
+ note_procstat_files, p);
+ size += register_note(list, NT_PROCSTAT_VMMAP,
+ note_procstat_vmmap, p);
+ size += register_note(list, NT_PROCSTAT_GROUPS,
+ note_procstat_groups, p);
+ size += register_note(list, NT_PROCSTAT_UMASK,
+ note_procstat_umask, p);
+ size += register_note(list, NT_PROCSTAT_RLIMIT,
+ note_procstat_rlimit, p);
+ size += register_note(list, NT_PROCSTAT_OSREL,
+ note_procstat_osrel, p);
+ size += register_note(list, NT_PROCSTAT_PSSTRINGS,
+ __elfN(note_procstat_psstrings), p);
+ size += register_note(list, NT_PROCSTAT_AUXV,
+ __elfN(note_procstat_auxv), p);
+
+ *sizep = size;
+}
+
+static void
+__elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
+ size_t notesz)
+{
+ Elf_Ehdr *ehdr;
+ Elf_Phdr *phdr;
+ struct phdr_closure phc;
+
+ ehdr = (Elf_Ehdr *)hdr;
+ phdr = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr));
+
+ ehdr->e_ident[EI_MAG0] = ELFMAG0;
+ ehdr->e_ident[EI_MAG1] = ELFMAG1;
+ ehdr->e_ident[EI_MAG2] = ELFMAG2;
+ ehdr->e_ident[EI_MAG3] = ELFMAG3;
+ ehdr->e_ident[EI_CLASS] = ELF_CLASS;
+ ehdr->e_ident[EI_DATA] = ELF_DATA;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
+ ehdr->e_ident[EI_ABIVERSION] = 0;
+ ehdr->e_ident[EI_PAD] = 0;
+ ehdr->e_type = ET_CORE;
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ ehdr->e_machine = ELF_ARCH32;
+#else
+ ehdr->e_machine = ELF_ARCH;
+#endif
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_entry = 0;
+ ehdr->e_phoff = sizeof(Elf_Ehdr);
+ ehdr->e_flags = 0;
+ ehdr->e_ehsize = sizeof(Elf_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf_Phdr);
+ ehdr->e_phnum = numsegs + 1;
+ ehdr->e_shentsize = sizeof(Elf_Shdr);
+ ehdr->e_shnum = 0;
+ ehdr->e_shstrndx = SHN_UNDEF;
+
+ /*
+ * Fill in the program header entries.
+ */
+
+	/* The note segment. */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = hdrsize;
+ phdr->p_vaddr = 0;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = notesz;
+ phdr->p_memsz = 0;
+ phdr->p_flags = PF_R;
+ phdr->p_align = ELF_NOTE_ROUNDSIZE;
+ phdr++;
+
+ /* All the writable segments from the program. */
+ phc.phdr = phdr;
+ phc.offset = round_page(hdrsize + notesz);
+ each_writable_segment(td, cb_put_phdr, &phc);
+}
+
+static size_t
+register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
+{
+ struct note_info *ninfo;
+ size_t size, notesize;
+
+ size = 0;
+ out(arg, NULL, &size);
+ ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
+ ninfo->type = type;
+ ninfo->outfunc = out;
+ ninfo->outarg = arg;
+ ninfo->outsize = size;
+ TAILQ_INSERT_TAIL(list, ninfo, link);
+
+ if (type == -1)
+ return (size);
+
+ notesize = sizeof(Elf_Note) + /* note header */
+ roundup2(8, ELF_NOTE_ROUNDSIZE) + /* note name ("FreeBSD") */
+ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */
+
+ return (notesize);
+}
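+/*
+ * As an illustration of the sizing above (sizeof(Elf_Note) is 12 bytes;
+ * here ELF_NOTE_ROUNDSIZE is assumed to be 4): a note whose out function
+ * reports a 500-byte descriptor occupies
+ * 12 + roundup2(8, 4) + roundup2(500, 4) = 12 + 8 + 500 = 520 bytes.
+ */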
+
+static void
+__elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
+{
+ Elf_Note note;
+ ssize_t old_len;
+
+ if (ninfo->type == -1) {
+ ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
+ return;
+ }
+
+ note.n_namesz = 8; /* strlen("FreeBSD") + 1 */
+ note.n_descsz = ninfo->outsize;
+ note.n_type = ninfo->type;
+
+ sbuf_bcat(sb, &note, sizeof(note));
+ sbuf_start_section(sb, &old_len);
+ sbuf_bcat(sb, "FreeBSD", note.n_namesz);
+ sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
+ if (note.n_descsz == 0)
+ return;
+ sbuf_start_section(sb, &old_len);
+ ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
+ sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
+}
+
+/*
+ * Miscellaneous note out functions.
+ */
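+/*
+ * Each out function below follows the two-pass contract used by
+ * register_note() and __elfN(putnote)(): when called with sb == NULL it
+ * only reports its descriptor size through *sizep; when called with a
+ * valid sbuf it emits the descriptor data. The variable-sized procstat
+ * notes size themselves by draining a throw-away sbuf into
+ * sbuf_drain_count().
+ */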
+
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+#include <compat/freebsd32/freebsd32.h>
+
+typedef struct prstatus32 elf_prstatus_t;
+typedef struct prpsinfo32 elf_prpsinfo_t;
+typedef struct fpreg32 elf_prfpregset_t;
+typedef struct fpreg32 elf_fpregset_t;
+typedef struct reg32 elf_gregset_t;
+typedef struct thrmisc32 elf_thrmisc_t;
+#define ELF_KERN_PROC_MASK KERN_PROC_MASK32
+typedef struct kinfo_proc32 elf_kinfo_proc_t;
+typedef uint32_t elf_ps_strings_t;
+#else
+typedef prstatus_t elf_prstatus_t;
+typedef prpsinfo_t elf_prpsinfo_t;
+typedef prfpregset_t elf_prfpregset_t;
+typedef prfpregset_t elf_fpregset_t;
+typedef gregset_t elf_gregset_t;
+typedef thrmisc_t elf_thrmisc_t;
+#define ELF_KERN_PROC_MASK 0
+typedef struct kinfo_proc elf_kinfo_proc_t;
+typedef vm_offset_t elf_ps_strings_t;
+#endif
+
+static void
+__elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ elf_prpsinfo_t *psinfo;
+
+ p = (struct proc *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*psinfo), ("invalid size"));
+ psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK);
+ psinfo->pr_version = PRPSINFO_VERSION;
+ psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
+ strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
+ /*
+ * XXX - We don't fill in the command line arguments properly
+ * yet.
+ */
+ strlcpy(psinfo->pr_psargs, p->p_comm,
+ sizeof(psinfo->pr_psargs));
+
+ sbuf_bcat(sb, psinfo, sizeof(*psinfo));
+ free(psinfo, M_TEMP);
+ }
+ *sizep = sizeof(*psinfo);
+}
+
+static void
+__elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_prstatus_t *status;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*status), ("invalid size"));
+ status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK);
+ status->pr_version = PRSTATUS_VERSION;
+ status->pr_statussz = sizeof(elf_prstatus_t);
+ status->pr_gregsetsz = sizeof(elf_gregset_t);
+ status->pr_fpregsetsz = sizeof(elf_fpregset_t);
+ status->pr_osreldate = osreldate;
+ status->pr_cursig = td->td_proc->p_sig;
+ status->pr_pid = td->td_tid;
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ fill_regs32(td, &status->pr_reg);
+#else
+ fill_regs(td, &status->pr_reg);
+#endif
+ sbuf_bcat(sb, status, sizeof(*status));
+ free(status, M_TEMP);
+ }
+ *sizep = sizeof(*status);
+}
+
+static void
+__elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_prfpregset_t *fpregset;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*fpregset), ("invalid size"));
+ fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK);
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ fill_fpregs32(td, fpregset);
+#else
+ fill_fpregs(td, fpregset);
+#endif
+ sbuf_bcat(sb, fpregset, sizeof(*fpregset));
+ free(fpregset, M_TEMP);
+ }
+ *sizep = sizeof(*fpregset);
+}
+
+static void
+__elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_thrmisc_t thrmisc;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(thrmisc), ("invalid size"));
+ bzero(&thrmisc._pad, sizeof(thrmisc._pad));
+ strcpy(thrmisc.pr_tname, td->td_name);
+ sbuf_bcat(sb, &thrmisc, sizeof(thrmisc));
+ }
+ *sizep = sizeof(thrmisc);
+}
+
+/*
+ * Allow for MD-specific notes, as well as any MD-specific
+ * preparations for writing MI notes.
+ */
+static void
+__elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ void *buf;
+ size_t size;
+
+ td = (struct thread *)arg;
+ size = *sizep;
+ buf = NULL;
+ if (size != 0 && sb != NULL)
+ buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
+ size = 0;
+ __elfN(dump_thread)(td, buf, &size);
+ KASSERT(*sizep == size, ("invalid size"));
+ if (size != 0 && sb != NULL)
+ sbuf_bcat(sb, buf, size);
+ *sizep = size;
+}
+
+#ifdef KINFO_PROC_SIZE
+CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
+#endif
+
+static void
+__elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + p->p_numthreads *
+ sizeof(elf_kinfo_proc_t);
+
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(elf_kinfo_proc_t);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
+ }
+ *sizep = size;
+}
+
+#ifdef KINFO_FILE_SIZE
+CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
+#endif
+
+static void
+note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_filedesc_out(p, sb, -1);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(struct kinfo_file);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_filedesc_out(p, sb, -1);
+ }
+}
+
+#ifdef KINFO_VMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
+#endif
+
+static void
+note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_vmmap_out(p, sb);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(struct kinfo_vmentry);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_vmmap_out(p, sb);
+ }
+}
+
+static void
+note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(gid_t);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups *
+ sizeof(gid_t));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(p->p_fd->fd_cmask);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ struct rlimit rlim[RLIM_NLIMITS];
+ size_t size;
+ int structsize, i;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(rlim);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(rlim);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ lim_rlimit(p, i, &rlim[i]);
+ PROC_UNLOCK(p);
+ sbuf_bcat(sb, rlim, sizeof(rlim));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(p->p_osrel);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(p->p_osrel);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel));
+ }
+ *sizep = size;
+}
+
+static void
+__elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ elf_ps_strings_t ps_strings;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(ps_strings);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(ps_strings);
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ ps_strings = PTROUT(p->p_sysent->sv_psstrings);
+#else
+ ps_strings = p->p_sysent->sv_psstrings;
+#endif
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &ps_strings, sizeof(ps_strings));
+ }
+ *sizep = size;
+}
+
+static void
+__elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PHOLD(p);
+ proc_getauxv(curthread, p, sb);
+ PRELE(p);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(Elf_Auxinfo);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PHOLD(p);
+ proc_getauxv(curthread, p, sb);
+ PRELE(p);
+ }
+}
+
+static boolean_t
+__elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote,
+ int32_t *osrel, const Elf_Phdr *pnote)
+{
+ const Elf_Note *note, *note0, *note_end;
+ const char *note_name;
+ int i;
+
+ if (pnote == NULL || pnote->p_offset > PAGE_SIZE ||
+ pnote->p_filesz > PAGE_SIZE - pnote->p_offset)
+ return (FALSE);
+
+ note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
+ note_end = (const Elf_Note *)(imgp->image_header +
+ pnote->p_offset + pnote->p_filesz);
+ for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
+ if (!aligned(note, Elf32_Addr) || (const char *)note_end -
+ (const char *)note < sizeof(Elf_Note))
+ return (FALSE);
+ if (note->n_namesz != checknote->hdr.n_namesz ||
+ note->n_descsz != checknote->hdr.n_descsz ||
+ note->n_type != checknote->hdr.n_type)
+ goto nextnote;
+ note_name = (const char *)(note + 1);
+ if (note_name + checknote->hdr.n_namesz >=
+ (const char *)note_end || strncmp(checknote->vendor,
+ note_name, checknote->hdr.n_namesz) != 0)
+ goto nextnote;
+
+ /*
+		 * Fetch the osreldate for the binary from the
+		 * ELF OSABI-note if necessary.
+ */
+ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 &&
+ checknote->trans_osrel != NULL)
+ return (checknote->trans_osrel(note, osrel));
+ return (TRUE);
+
+nextnote:
+ note = (const Elf_Note *)((const char *)(note + 1) +
+ roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
+ roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE));
+ }
+
+ return (FALSE);
+}
+
+/*
+ * Try to find the appropriate ABI-note section for checknote, and fetch
+ * the osreldate for the binary from the ELF OSABI-note. Only the first
+ * page of the image is searched, the same as for the headers.
+ */
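+/*
+ * For reference (the brand notes themselves are defined elsewhere), a
+ * typical FreeBSD ABI-tag note has n_namesz = 8, n_descsz = 4, n_type = 1,
+ * the name "FreeBSD\0", and a 4-byte __FreeBSD_version value as its
+ * descriptor, from which trans_osrel derives *osrel.
+ */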
+static boolean_t
+__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote,
+ int32_t *osrel)
+{
+ const Elf_Phdr *phdr;
+ const Elf_Ehdr *hdr;
+ int i;
+
+ hdr = (const Elf_Ehdr *)imgp->image_header;
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_NOTE &&
+ __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i]))
+ return (TRUE);
+ }
+ return (FALSE);
+
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw __elfN(execsw) = {
+ __CONCAT(exec_, __elfN(imgact)),
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
+};
+EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
+
+#ifdef COMPRESS_USER_CORES
+/*
+ * Compress and write out a core segment for a user process.
+ *
+ * 'inbuf' is the starting address of a VM segment in the process' address
+ * space that is to be compressed and written out to the core file. 'dest_buf'
+ * is a buffer in the kernel's address space. The segment is copied from
+ * 'inbuf' to 'dest_buf' first before being processed by the compression
+ * routine gzwrite(). This copying is necessary because the content of the VM
+ * segment may change between the compression pass and the crc-computation pass
+ * in gzwrite(). This is because realtime threads may preempt the UNIX kernel.
+ *
+ * If inbuf is NULL it is assumed that data is already copied to 'dest_buf'.
+ */
+static int
+compress_core (gzFile file, char *inbuf, char *dest_buf, unsigned int len,
+ struct thread *td)
+{
+ int len_compressed;
+ int error = 0;
+ unsigned int chunk_len;
+
+ while (len) {
+ if (inbuf != NULL) {
+ chunk_len = (len > CORE_BUF_SIZE) ? CORE_BUF_SIZE : len;
+ copyin(inbuf, dest_buf, chunk_len);
+ inbuf += chunk_len;
+ } else {
+ chunk_len = len;
+ }
+ len_compressed = gzwrite(file, dest_buf, chunk_len);
+
+ EVENTHANDLER_INVOKE(app_coredump_progress, td, len_compressed);
+
+ if ((unsigned int)len_compressed != chunk_len) {
+ log(LOG_WARNING,
+ "compress_core: length mismatch (0x%x returned, "
+ "0x%x expected)\n", len_compressed, chunk_len);
+ EVENTHANDLER_INVOKE(app_coredump_error, td,
+ "compress_core: length mismatch %x -> %x",
+ chunk_len, len_compressed);
+ error = EFAULT;
+ break;
+ }
+ len -= chunk_len;
+ maybe_yield();
+ }
+
+ return (error);
+}
+#endif /* COMPRESS_USER_CORES */
+
+static vm_prot_t
+__elfN(trans_prot)(Elf_Word flags)
+{
+ vm_prot_t prot;
+
+ prot = 0;
+ if (flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (flags & PF_R)
+ prot |= VM_PROT_READ;
+#if __ELF_WORD_SIZE == 32
+#if defined(__amd64__) || defined(__ia64__)
+ if (i386_read_exec && (flags & PF_R))
+ prot |= VM_PROT_EXECUTE;
+#endif
+#endif
+ return (prot);
+}
+
+static Elf_Word
+__elfN(untrans_prot)(vm_prot_t prot)
+{
+ Elf_Word flags;
+
+ flags = 0;
+ if (prot & VM_PROT_EXECUTE)
+ flags |= PF_X;
+ if (prot & VM_PROT_READ)
+ flags |= PF_R;
+ if (prot & VM_PROT_WRITE)
+ flags |= PF_W;
+ return (flags);
+}
diff --git a/sys/kern/imgact_elf32.c b/sys/kern/imgact_elf32.c
new file mode 100644
index 0000000..b286f31
--- /dev/null
+++ b/sys/kern/imgact_elf32.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2002 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define __ELF_WORD_SIZE 32
+#include <kern/imgact_elf.c>
diff --git a/sys/kern/imgact_elf64.c b/sys/kern/imgact_elf64.c
new file mode 100644
index 0000000..db2470d
--- /dev/null
+++ b/sys/kern/imgact_elf64.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2002 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define __ELF_WORD_SIZE 64
+#include <kern/imgact_elf.c>
diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c
new file mode 100644
index 0000000..230854b
--- /dev/null
+++ b/sys/kern/imgact_gzip.c
@@ -0,0 +1,393 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * This module handles execution of a.out files which have been run through
+ * "gzip". This saves disk space but wastes CPU cycles and VM.
+ *
+ * TODO:
+ * text-segments should be made R/O after being filled
+ * is the vm-stuff safe ?
+ * should handle the entire header of gzip'ed stuff.
+ * inflate isn't quite reentrant yet...
+ * error-handling is a mess...
+ * so is the rest...
+ * tidy up unnecessary includes
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/inflate.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+struct imgact_gzip {
+ struct image_params *ip;
+ struct exec a_out;
+ int error;
+ int gotheader;
+ int where;
+ u_char *inbuf;
+ u_long offset;
+ u_long output;
+ u_long len;
+ int idx;
+ u_long virtual_offset, file_offset, file_end, bss_size;
+};
+
+static int exec_gzip_imgact(struct image_params *imgp);
+static int NextByte(void *vp);
+static int do_aout_hdr(struct imgact_gzip *);
+static int Flush(void *vp, u_char *, u_long siz);
+
+static int
+exec_gzip_imgact(imgp)
+ struct image_params *imgp;
+{
+ int error;
+ const u_char *p = (const u_char *) imgp->image_header;
+ struct imgact_gzip igz;
+ struct inflate infl;
+ struct vmspace *vmspace;
+
+ /* If these four are not OK, it isn't a gzip file */
+ if (p[0] != 0x1f)
+ return -1; /* 0 Simply magic */
+ if (p[1] != 0x8b)
+ return -1; /* 1 Simply magic */
+ if (p[2] != 0x08)
+ return -1; /* 2 Compression method */
+ if (p[9] != 0x03)
+ return -1; /* 9 OS compressed on */
+
+ /*
+ * If this one contains anything but a comment or a filename marker,
+ * we don't want to chew on it
+ */
+ if (p[3] & ~(0x18))
+ return ENOEXEC; /* 3 Flags */
+
+ /* These are of no use to us */
+ /* 4-7 Timestamp */
+ /* 8 Extra flags */
+
+ bzero(&igz, sizeof igz);
+ bzero(&infl, sizeof infl);
+ infl.gz_private = (void *) &igz;
+ infl.gz_input = NextByte;
+ infl.gz_output = Flush;
+
+ igz.ip = imgp;
+ igz.idx = 10;
+
+ if (p[3] & 0x08) { /* skip a filename */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ if (p[3] & 0x10) { /* skip a comment */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ igz.len = imgp->attr->va_size;
+
+ error = inflate(&infl);
+
+ /*
+	 * The unzipped file may not even have been long enough to contain
+	 * a header, in which case Flush() never got a chance to return an
+	 * error. Check for this.
+ */
+ if ( !igz.gotheader )
+ return ENOEXEC;
+
+ if ( !error ) {
+ vmspace = imgp->proc->p_vmspace;
+ error = vm_map_protect(&vmspace->vm_map,
+ (vm_offset_t) vmspace->vm_taddr,
+ (vm_offset_t) (vmspace->vm_taddr +
+ (vmspace->vm_tsize << PAGE_SHIFT)) ,
+ VM_PROT_READ|VM_PROT_EXECUTE,0);
+ }
+
+ if (igz.inbuf)
+ kmap_free_wakeup(exec_map, (vm_offset_t)igz.inbuf, PAGE_SIZE);
+ if (igz.error || error) {
+ printf("Output=%lu ", igz.output);
+ printf("Inflate_error=%d igz.error=%d where=%d\n",
+ error, igz.error, igz.where);
+ }
+ if (igz.error)
+ return igz.error;
+ if (error)
+ return ENOEXEC;
+ return 0;
+}
+
+static int
+do_aout_hdr(struct imgact_gzip * gz)
+{
+ int error;
+ struct vmspace *vmspace;
+ vm_offset_t vmaddr;
+
+ /*
+ * Set file/virtual offset based on a.out variant. We do two cases:
+ * host byte order and network byte order (for NetBSD compatibility)
+ */
+ switch ((int) (gz->a_out.a_midmag & 0xffff)) {
+ case ZMAGIC:
+ gz->virtual_offset = 0;
+ if (gz->a_out.a_text) {
+ gz->file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ gz->file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int) (ntohl(gz->a_out.a_midmag) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ gz->where = __LINE__;
+ return (-1);
+ }
+ }
+
+ gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if ( /* entry point must lie within the text region */
+ gz->a_out.a_entry < gz->virtual_offset ||
+ gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
+
+ /* text and data size must each be page rounded */
+ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
+ gz->where = __LINE__;
+ return (-1);
+ }
+ /*
+ * text/data/bss must not exceed limits
+ */
+ PROC_LOCK(gz->ip->proc);
+ if ( /* text can't exceed maximum text size */
+ gz->a_out.a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ gz->a_out.a_data + gz->bss_size >
+ lim_cur(gz->ip->proc, RLIMIT_DATA) ||
+ racct_set(gz->ip->proc, RACCT_DATA,
+ gz->a_out.a_data + gz->bss_size) != 0) {
+ PROC_UNLOCK(gz->ip->proc);
+ gz->where = __LINE__;
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(gz->ip->proc);
+ /* Find out how far we should go */
+ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ */
+ VOP_UNLOCK(gz->ip->vp, 0);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ error = exec_new_vmspace(gz->ip, &aout_sysvec);
+
+ vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ vmspace = gz->ip->proc->p_vmspace;
+
+ vmaddr = gz->virtual_offset;
+
+ error = vm_mmap(&vmspace->vm_map,
+ &vmaddr,
+ gz->a_out.a_text + gz->a_out.a_data,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
+ OBJT_DEFAULT,
+ NULL,
+ 0);
+
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ if (gz->bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data.
+ * "bss" = 'block started by symbol' - named after the
+ * IBM 7090 instruction of the same name.
+ */
+ vmaddr = gz->virtual_offset + gz->a_out.a_text +
+ gz->a_out.a_data;
+ error = vm_map_find(&vmspace->vm_map,
+ NULL,
+ 0,
+ &vmaddr,
+ gz->bss_size,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ }
+ /* Fill in process VM information */
+ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (gz->virtual_offset + gz->a_out.a_text);
+
+ /* Fill in image_params */
+ gz->ip->interpreted = 0;
+ gz->ip->entry_addr = gz->a_out.a_entry;
+
+ gz->ip->proc->p_sysent = &aout_sysvec;
+
+ return 0;
+}
+
+static int
+NextByte(void *vp)
+{
+ int error;
+ struct imgact_gzip *igz = (struct imgact_gzip *) vp;
+
+ if (igz->idx >= igz->len) {
+ igz->where = __LINE__;
+ return GZ_EOF;
+ }
+ if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
+ return igz->inbuf[(igz->idx++) - igz->offset];
+ }
+ if (igz->inbuf)
+ kmap_free_wakeup(exec_map, (vm_offset_t)igz->inbuf, PAGE_SIZE);
+ igz->offset = igz->idx & ~PAGE_MASK;
+
+ error = vm_mmap(exec_map, /* map */
+ (vm_offset_t *) & igz->inbuf, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ OBJT_VNODE, /* handle type */
+ igz->ip->vp, /* vnode */
+ igz->offset); /* offset */
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ return igz->inbuf[(igz->idx++) - igz->offset];
+}
+
+static int
+Flush(void *vp, u_char * ptr, u_long siz)
+{
+ struct imgact_gzip *gz = (struct imgact_gzip *) vp;
+ u_char *p = ptr, *q;
+ int i;
+
+ /* First, find an a.out-header. */
+ if (gz->output < sizeof gz->a_out) {
+ q = (u_char *) & gz->a_out;
+ i = min(siz, sizeof gz->a_out - gz->output);
+ bcopy(p, q + gz->output, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ if (gz->output == sizeof gz->a_out) {
+ gz->gotheader = 1;
+ i = do_aout_hdr(gz);
+ if (i == -1) {
+ if (!gz->where)
+ gz->where = __LINE__;
+ gz->error = ENOEXEC;
+ return ENOEXEC;
+ } else if (i) {
+ gz->where = __LINE__;
+ gz->error = i;
+ return ENOEXEC;
+ }
+ if (gz->file_offset == 0) {
+ q = (u_char *) (uintptr_t) gz->virtual_offset;
+ copyout(&gz->a_out, q, sizeof gz->a_out);
+ }
+ }
+ }
+ /* Skip over zero-padded first PAGE if needed */
+ if (gz->output < gz->file_offset &&
+ gz->output + siz > gz->file_offset) {
+ i = min(siz, gz->file_offset - gz->output);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
+ i = min(siz, gz->file_end - gz->output);
+ q = (u_char *) (uintptr_t)
+ (gz->virtual_offset + gz->output - gz->file_offset);
+ copyout(p, q, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ gz->output += siz;
+ return 0;
+}
+
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
+EXEC_SET(execgzip, gzip_execsw);
diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c
new file mode 100644
index 0000000..d9884f5
--- /dev/null
+++ b/sys/kern/imgact_shell.c
@@ -0,0 +1,258 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
+
+/*
+ * At the time of this writing, MAXSHELLCMDLEN == PAGE_SIZE. This is
+ * significant because the caller has only mapped in one page of the
+ * file we're reading.
+ */
+#if MAXSHELLCMDLEN > PAGE_SIZE
+#error "MAXSHELLCMDLEN is larger than a single page!"
+#endif
+
+/*
+ * MAXSHELLCMDLEN must be at least MAXINTERP plus the size of the `#!'
+ * prefix and terminating newline.
+ */
+CTASSERT(MAXSHELLCMDLEN >= MAXINTERP + 3);
+
+/**
+ * Shell interpreter image activator. On successful exit, at a minimum an
+ * interpreter name has been placed at imgp->args->begin_argv.
+ *
+ * If the given file is a shell-script, then the first line will start
+ * with the two characters `#!' (aka SHELLMAGIC), followed by the name
+ * of the shell-interpreter to run, followed by zero or more tokens.
+ *
+ * The interpreter is then started up such that it will see:
+ * arg[0] -> The name of interpreter as specified after `#!' in the
+ * first line of the script. The interpreter name must
+ * not be longer than MAXSHELLCMDLEN bytes.
+ * arg[1] -> *If* there are any additional tokens on the first line,
+ * then we add a new arg[1], which is a copy of the rest of
+ * that line. The copy starts at the first token after the
+ * interpreter name. We leave it to the interpreter to
+ * parse the tokens in that value.
+ * arg[x] -> the full pathname of the script. This will either be
+ * arg[2] or arg[1], depending on whether or not tokens
+ * were found after the interpreter name.
+ * arg[x+1] -> all the arguments that were specified on the original
+ * command line.
+ *
+ * This processing is described in the execve(2) man page.
+ */
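+/*
+ * As a purely illustrative example: executing a script /tmp/hello.sh whose
+ * first line is "#!/bin/sh -x" as "hello.sh one two" starts the
+ * interpreter with:
+ * arg[0] = "/bin/sh", arg[1] = "-x", arg[2] = "/tmp/hello.sh",
+ * arg[3] = "one", arg[4] = "two".
+ */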
+
+/*
+ * HISTORICAL NOTE: From 1993 to mid-2005, FreeBSD parsed out the tokens as
+ * found on the first line of the script, and set up each token as a separate
+ * value in arg[]. This extra processing did not match the behavior of other
+ * OS's, and caused a few subtle problems. For one, it meant the kernel was
+ * deciding how those values should be parsed (wrt characters for quoting or
+ * comments, etc), while the interpreter might have other rules for parsing.
+ * It also meant the interpreter had no way of knowing which arguments came
+ * from the first line of the shell script, and which arguments were specified
+ * by the user on the command line. That extra processing was dropped in the
+ * 6.x branch on May 28, 2005 (matching __FreeBSD_version 600029).
+ */
+int
+exec_shell_imgact(imgp)
+ struct image_params *imgp;
+{
+ const char *image_header = imgp->image_header;
+ const char *ihp, *interpb, *interpe, *maxp, *optb, *opte, *fname;
+ int error, offset;
+ size_t length;
+ struct vattr vattr;
+ struct sbuf *sname;
+
+ /* a shell script? */
+ if (((const short *)image_header)[0] != SHELLMAGIC)
+ return (-1);
+
+ /*
+ * Don't allow a shell script to be the shell for a shell
+ * script. :-)
+ */
+ if (imgp->interpreted)
+ return (ENOEXEC);
+
+ imgp->interpreted = 1;
+
+ /*
+ * At this point we have the first page of the file mapped.
+ * However, we don't know how far into the page the contents are
+ * valid -- the actual file might be much shorter than the page.
+ * So find out the file size.
+ */
+ error = VOP_GETATTR(imgp->vp, &vattr, imgp->proc->p_ucred);
+ if (error)
+ return (error);
+
+ /*
+ * Copy shell name and arguments from image_header into a string
+ * buffer.
+ */
+ maxp = &image_header[MIN(vattr.va_size, MAXSHELLCMDLEN)];
+ ihp = &image_header[2];
+
+ /*
+ * Find the beginning and end of the interpreter_name. If the
+ * line does not include any interpreter, or if the name which
+ * was found is too long, we bail out.
+ */
+ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t')))
+ ihp++;
+ interpb = ihp;
+ while (ihp < maxp && ((*ihp != ' ') && (*ihp != '\t') && (*ihp != '\n')
+ && (*ihp != '\0')))
+ ihp++;
+ interpe = ihp;
+ if (interpb == interpe)
+ return (ENOEXEC);
+ if (interpe - interpb >= MAXINTERP)
+ return (ENAMETOOLONG);
+
+ /*
+ * Find the beginning of the options (if any), and the end-of-line.
+ * Then trim the trailing blanks off the value. Note that some
+ * other operating systems do *not* trim the trailing whitespace...
+ */
+ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t')))
+ ihp++;
+ optb = ihp;
+ while (ihp < maxp && ((*ihp != '\n') && (*ihp != '\0')))
+ ihp++;
+ opte = ihp;
+ if (opte == maxp)
+ return (ENOEXEC);
+ while (--ihp > optb && ((*ihp == ' ') || (*ihp == '\t')))
+ opte = ihp;
+
+ if (imgp->args->fname != NULL) {
+ fname = imgp->args->fname;
+ sname = NULL;
+ } else {
+ sname = sbuf_new_auto();
+ sbuf_printf(sname, "/dev/fd/%d", imgp->args->fd);
+ sbuf_finish(sname);
+ fname = sbuf_data(sname);
+ }
+
+ /*
+ * We need to "pop" (remove) the present value of arg[0], and "push"
+ * either two or three new values in the arg[] list. To do this,
+ * we first shift all the other values in the `begin_argv' area to
+ * provide the exact amount of room for the values added. Set up
+ * `offset' as the number of bytes to be added to the `begin_argv'
+ * area, and 'length' as the number of bytes being removed.
+ */
+ offset = interpe - interpb + 1; /* interpreter */
+ if (opte > optb) /* options (if any) */
+ offset += opte - optb + 1;
+ offset += strlen(fname) + 1; /* fname of script */
+ length = (imgp->args->argc == 0) ? 0 :
+ strlen(imgp->args->begin_argv) + 1; /* bytes to delete */
+
+ if (offset > imgp->args->stringspace + length) {
+ if (sname != NULL)
+ sbuf_delete(sname);
+ return (E2BIG);
+ }
+
+ bcopy(imgp->args->begin_argv + length, imgp->args->begin_argv + offset,
+ imgp->args->endp - (imgp->args->begin_argv + length));
+
+ offset -= length; /* calculate actual adjustment */
+ imgp->args->begin_envv += offset;
+ imgp->args->endp += offset;
+ imgp->args->stringspace -= offset;
+
+ /*
+ * If there was no arg[0] when we started, then the interpreter_name
+ * is adding an argument (instead of replacing the arg[0] we started
+ * with). And we're always adding an argument when we include the
+ * full pathname of the original script.
+ */
+ if (imgp->args->argc == 0)
+ imgp->args->argc = 1;
+ imgp->args->argc++;
+
+ /*
+ * The original arg[] list has been shifted appropriately. Copy in
+ * the interpreter name and options-string.
+ */
+ length = interpe - interpb;
+ bcopy(interpb, imgp->args->begin_argv, length);
+ *(imgp->args->begin_argv + length) = '\0';
+ offset = length + 1;
+ if (opte > optb) {
+ length = opte - optb;
+ bcopy(optb, imgp->args->begin_argv + offset, length);
+ *(imgp->args->begin_argv + offset + length) = '\0';
+ offset += length + 1;
+ imgp->args->argc++;
+ }
+
+ /*
+ * Finally, add the filename onto the end for the interpreter to
+ * use and copy the interpreter's name to imgp->interpreter_name
+ * for exec to use.
+ */
+ error = copystr(fname, imgp->args->begin_argv + offset,
+ imgp->args->stringspace, NULL);
+
+ if (error == 0)
+ imgp->interpreter_name = imgp->args->begin_argv;
+
+ if (sname != NULL)
+ sbuf_delete(sname);
+ return (error);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw shell_execsw = { exec_shell_imgact, "#!" };
+EXEC_SET(shell, shell_execsw);
diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c
new file mode 100644
index 0000000..383ebc4
--- /dev/null
+++ b/sys/kern/inflate.c
@@ -0,0 +1,1077 @@
+/*
+ * Most parts of this file are not covered by:
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/inflate.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#endif
+#include <sys/malloc.h>
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_GZIP, "gzip_trees", "Gzip trees");
+#endif
+
+/* needed to make inflate() work */
+#define uch u_char
+#define ush u_short
+#define ulg u_long
+
+/* Stuff to make inflate() work */
+#ifdef _KERNEL
+#define memzero(dest,len) bzero(dest,len)
+#endif
+#define NOMEMCPY
+#ifdef _KERNEL
+#define FPRINTF printf
+#else
+extern void putstr (char *);
+#define FPRINTF putstr
+#endif
+
+#define FLUSH(x,y) { \
+ int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \
+ if (foo) \
+ return foo; \
+ }
+
+static const int qflag = 0;
+
+#ifndef _KERNEL /* want to use this file in kzip also */
+extern unsigned char *kzipmalloc (int);
+extern void kzipfree (void*);
+#define malloc(x, y, z) kzipmalloc((x))
+#define free(x, y) kzipfree((x))
+#endif
+
+/*
+ * This came from unzip-5.12. I have changed it the flow to pass
+ * a structure pointer around, thus hopefully making it re-entrant.
+ * Poul-Henning
+ */
+
+/* inflate.c -- put in the public domain by Mark Adler
+ version c14o, 23 August 1994 */
+
+/* You can do whatever you like with this source file, though I would
+ prefer that if you modify it and redistribute it that you include
+ comments to that effect with your name and the date. Thank you.
+
+ History:
+ vers date who what
+ ---- --------- -------------- ------------------------------------
+ a ~~ Feb 92 M. Adler used full (large, one-step) lookup table
+ b1 21 Mar 92 M. Adler first version with partial lookup tables
+ b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks
+ b3 22 Mar 92 M. Adler sped up match copies, cleaned up some
+ b4 25 Mar 92 M. Adler added prototypes; removed window[] (now
+ is the responsibility of unzip.h--also
+ changed name to slide[]), so needs diffs
+ for unzip.c and unzip.h (this allows
+ compiling in the small model on MSDOS);
+ fixed cast of q in huft_build();
+ b5 26 Mar 92 M. Adler got rid of unintended macro recursion.
+ b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed
+ bug in inflate_fixed().
+ c1 30 Mar 92 M. Adler removed lbits, dbits environment variables.
+ changed BMAX to 16 for explode. Removed
+ OUTB usage, and replaced it with flush()--
+ this was a 20% speed improvement! Added
+ an explode.c (to replace unimplod.c) that
+ uses the huft routines here. Removed
+ register union.
+ c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
+ c3 10 Apr 92 M. Adler reduced memory of code tables made by
+ huft_build significantly (factor of two to
+ three).
+ c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy().
+ worked around a Turbo C optimization bug.
+ c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing
+ the 32K window size for specialized
+ applications.
+ c6 31 May 92 M. Adler added some typecasts to eliminate warnings
+ c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug).
+ c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug.
+ c9 9 Oct 92 M. Adler removed a memory error message (~line 416).
+ c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch,
+ removed old inflate, renamed inflate_entry
+ to inflate, added Mark's fix to a comment.
+ c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees.
+ c11 2 Jan 93 M. Adler fixed bug in detection of incomplete
+ tables, and removed assumption that EOB is
+ the longest code (bad assumption).
+ c12 3 Jan 93 M. Adler make tables for fixed blocks only once.
+ c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c
+ outputs one zero length code for an empty
+ distance tree).
+ c14 12 Mar 93 M. Adler made inflate.c standalone with the
+ introduction of inflate.h.
+ c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470.
+ c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays
+ to static for Amiga.
+ c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing.
+ c14e 8 Oct 93 G. Roelofs changed memset() to memzero().
+ c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace()
+ conditional; added inflate_free().
+ c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug)
+ c14h 7 Dec 93 C. Ghisler huft_build() optimizations.
+ c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing;
+ G. Roelofs check NEXTBYTE macro for GZ_EOF.
+ c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd
+ GZ_EOF check.
+ c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings.
+ c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
+ to avoid bug in Encore compiler.
+ c14m 7 Jul 94 P. Kienitz modified to allow assembler version of
+ inflate_codes() (define ASM_INFLATECODES)
+ c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions
+ c14o 23 Aug 94 C. Spieler added a newline to a debug statement;
+ G. Roelofs added another typecast to avoid MSC warning
+ */
+
+
+/*
+ Inflate deflated (PKZIP's method 8 compressed) data. The compression
+ method searches for as much of the current string of bytes (up to a
+ length of 258) in the previous 32K bytes. If it doesn't find any
+ matches (of at least length 3), it codes the next byte. Otherwise, it
+ codes the length of the matched string and its distance backwards from
+ the current position. There is a single Huffman code that codes both
+ single bytes (called "literals") and match lengths. A second Huffman
+ code codes the distance information, which follows a length code. Each
+ length or distance code actually represents a base value and a number
+ of "extra" (sometimes zero) bits to get to add to the base value. At
+ the end of each deflated block is a special end-of-block (EOB) literal/
+ length code. The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+   The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
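+/*
+ * For example, the string "abcabcabc" can be coded as the three literals
+ * 'a', 'b', 'c' followed by a single length/distance pair (length 6,
+ * distance 3); the decoder copies six bytes starting three bytes back in
+ * the sliding window, reproducing the remaining "abcabc".
+ */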
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+      defined for them.  Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type is really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6). Therefore, to output three times the length, you output
+ three codes (1+1+1), whereas to output four times the same length,
+ you only need two codes (1+3). Hmm.
+ 10. In the tree reconstruction algorithm, Code = Code + Increment
+ only if BitLength(i) is not zero. (Pretty obvious.)
+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)
+ 12. Note: length code 284 can represent 227-258, but length code 285
+ really is 258. The last length deserves its own, short code
+ since it gets used a lot in very redundant files. The length
+ 258 is special since 258 - 3 (the min match length) is 255.
+ 13. The literal/length and distance code bit lengths are read as a
+ single stream of lengths. It is possible (and advantageous) for
+ a repeat code (16, 17, or 18) to go across the boundary between
+ the two sets of lengths.
+ */
+
+
+#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */
+
+/*
+ inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE,
+ FLUSH() and memzero macros. If the window size is not 32K, it
+ should also define GZ_WSIZE. If INFMOD is defined, it can include
+ compiled functions to support the NEXTBYTE and/or FLUSH() macros.
+ There are defaults for NEXTBYTE and FLUSH() below for use as
+ examples of what those functions need to do. Normally, you would
+ also want FLUSH() to compute a crc on the data. inflate.h also
+ needs to provide these typedefs:
+
+ typedef unsigned char uch;
+ typedef unsigned short ush;
+ typedef unsigned long ulg;
+
+ This module uses the external functions malloc() and free() (and
+ probably memset() or bzero() in the memzero() macro). Their
+ prototypes are normally found in <string.h> and <stdlib.h>.
+ */
+#define INFMOD /* tell inflate.h to include code to be
+ * compiled */
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+ that have 16-bit pointers (e.g. PC's in the small or medium model).
+ Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16
+ means that v is a literal, 16 < e < 32 means that v is a pointer to
+ the next table, which codes e - 16 bits, and lastly e == 99 indicates
+ an unused code. If a code with e == 99 is looked up, this implies an
+ error in the data. */
+struct huft {
+ uch e; /* number of extra bits or operation */
+ uch b; /* number of bits in this code or subcode */
+ union {
+ ush n; /* literal, length base, or distance
+ * base */
+ struct huft *t; /* pointer to next level of table */
+ } v;
+};
+
+
+/* Function prototypes */
+static int huft_build(struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *);
+static int huft_free(struct inflate *, struct huft *);
+static int inflate_codes(struct inflate *, struct huft *, struct huft *, int, int);
+static int inflate_stored(struct inflate *);
+static int xinflate(struct inflate *);
+static int inflate_fixed(struct inflate *);
+static int inflate_dynamic(struct inflate *);
+static int inflate_block(struct inflate *, int *);
+
+/* The inflate algorithm uses a sliding 32K byte window on the uncompressed
+ stream to find repeated byte strings. This is implemented here as a
+ circular buffer. The index is updated simply by incrementing and then
+ and'ing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32K area. It is assumed
+ to be usable as if it were declared "uch slide[32768];" or as just
+   "uch *slide;" and then malloc'ed in the latter case.  The definition
+   must be in inflate.h, included above. */
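+
+/* As a concrete sketch of the scheme described above, emitting one
+   decoded byte into the window looks like this (next_byte is a
+   stand-in name; the real loops below inline the same pattern):
+
+	glbl->gz_slide[w++] = next_byte;
+	if (w == GZ_WSIZE) {		-- window full: write it out and wrap
+		FLUSH(glbl, w);		-- the wrap is the "and with 0x7fff"
+		w = 0;
+	}
+ */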
+
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+
+/* Order of the bit length code lengths */
+static const unsigned border[] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ /* note: see note #13 above about the 258 in this list. */
+
+static const ush cplext[] = { /* Extra bits for literal codes 257..285 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+
+static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577};
+
+static const ush cpdext[] = { /* Extra bits for distance codes */
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 12, 13, 13};
+
+/* And'ing with mask[n] masks the lower n bits */
+static const ush mask[] = {
+ 0x0000,
+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* Macros for inflate() bit peeking and grabbing.
+ The usage is:
+
+ NEEDBITS(glbl,j)
+ x = b & mask[j];
+ DUMPBITS(j)
+
+ where NEEDBITS makes sure that b has at least j bits in it, and
+ DUMPBITS removes the bits from b. The macros use the variable k
+ for the number of bits in b. Normally, b and k are register
+   variables for speed, and are initialized at the beginning of a
+ routine that uses these macros from a global bit buffer and count.
+
+ In order to not ask for more bits than there are in the compressed
+ stream, the Huffman tables are constructed to only ask for just
+ enough bits to make up the end-of-block code (value 256). Then no
+ bytes need to be "returned" to the buffer at the end of the last
+ block. See the huft_build() routine.
+ */
+
+/*
+ * The following 2 were global variables.
+ * They are now fields of the inflate structure.
+ */
+
+#define NEEDBITS(glbl,n) { \
+ while(k<(n)) { \
+ int c=(*glbl->gz_input)(glbl->gz_private); \
+ if(c==GZ_EOF) \
+ return 1; \
+ b|=((ulg)c)<<k; \
+ k+=8; \
+ } \
+ }
+
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
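+
+/* For example, the 5-bit "number of literal/length codes" field in a
+   dynamic block header is read with (this is the pattern used by
+   inflate_dynamic() below; b and k are the local bit buffer and count):
+
+	NEEDBITS(glbl, 5)
+	nl = 257 + ((unsigned) b & mask[5]);
+	DUMPBITS(5)
+
+   NEEDBITS() makes the enclosing routine return 1 if the compressed
+   input runs out before the requested bits arrive. */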
+
+/*
+ Huffman code decoding is performed using a multi-level table lookup.
+ The fastest way to decode is to simply build a lookup table whose
+ size is determined by the longest code. However, the time it takes
+ to build this table can also be a factor if the data being decoded
+ is not very long. The most common codes are necessarily the
+ shortest codes, so those codes dominate the decoding time, and hence
+ the speed. The idea is you can have a shorter table that decodes the
+ shorter, more probable codes, and then point to subsidiary tables for
+ the longer codes. The time it costs to decode the longer codes is
+ then traded against the time it takes to make longer tables.
+
+   The results of this trade-off are in the variables lbits and dbits
+ below. lbits is the number of bits the first level table for literal/
+ length codes can decode in one step, and dbits is the same thing for
+ the distance codes. Subsequent tables are also less than or equal to
+ those sizes. These values may be adjusted either when all of the
+ codes are shorter than that, in which case the longest code length in
+ bits is used, or when the shortest code is *longer* than the requested
+ table size, in which case the length of the shortest code in bits is
+ used.
+
+ There are two different values for the two tables, since they code a
+ different number of possibilities each. The literal/length table
+ codes 286 possible values, or in a flat code, a little over eight
+ bits. The distance table codes 30 possible values, or a little less
+ than five bits, flat. The optimum values for speed end up being
+ about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+ The optimum values may differ though from machine to machine, and
+ possibly even between compilers. Your mileage may vary.
+ */
+
+static const int lbits = 9; /* bits in base literal/length lookup table */
+static const int dbits = 6; /* bits in base distance lookup table */
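+
+/* Condensed, the resulting two-level lookup is the loop inflate_codes()
+   uses below (tl and bl are the first-level literal/length table and its
+   bit width; the e == 99 "invalid code" check is omitted for brevity):
+
+	NEEDBITS(glbl, bl)
+	t = tl + ((unsigned) b & mask[bl]);
+	while ((e = t->e) > 16) {	-- entry points to a subtable
+		DUMPBITS(t->b)		-- drop the bits this level consumed
+		e -= 16;		-- the subtable codes e - 16 bits
+		NEEDBITS(glbl, e)
+		t = t->v.t + ((unsigned) b & mask[e]);
+	}
+	DUMPBITS(t->b)			-- t is now a literal, length, or EOB
+ */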
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16 /* maximum bit length of any code (16 for
+ * explode) */
+#define N_MAX 288 /* maximum number of codes in any set */
+
+/* Given a list of code lengths and a maximum table size, make a set of
+ tables to decode that set of codes. Return zero on success, one if
+ the given code set is incomplete (the tables are still built in this
+ case), two if the input is invalid (all zero length codes or an
+ oversubscribed set of lengths), and three if not enough memory.
+ The code with value 256 is special, and the tables are constructed
+ so that no bits beyond that code are fetched when that code is
+ decoded. */
+static int
+huft_build(glbl, b, n, s, d, e, t, m)
+ struct inflate *glbl;
+ unsigned *b; /* code lengths in bits (all assumed <= BMAX) */
+ unsigned n; /* number of codes (assumed <= N_MAX) */
+ unsigned s; /* number of simple-valued codes (0..s-1) */
+ const ush *d; /* list of base values for non-simple codes */
+ const ush *e; /* list of extra bits for non-simple codes */
+ struct huft **t; /* result: starting table */
+ int *m; /* maximum lookup bits, returns actual */
+{
+ unsigned a; /* counter for codes of length k */
+ unsigned c[BMAX + 1]; /* bit length count table */
+ unsigned el; /* length of EOB code (value 256) */
+ unsigned f; /* i repeats in table every f entries */
+ int g; /* maximum code length */
+ int h; /* table level */
+ register unsigned i; /* counter, current code */
+ register unsigned j; /* counter */
+ register int k; /* number of bits in current code */
+ int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */
+ int *l = lx + 1; /* stack of bits per table */
+ register unsigned *p; /* pointer into c[], b[], or v[] */
+ register struct huft *q;/* points to current table */
+ struct huft r; /* table entry for structure assignment */
+ struct huft *u[BMAX];/* table stack */
+ unsigned v[N_MAX]; /* values in order of bit length */
+ register int w; /* bits before this table == (l * h) */
+ unsigned x[BMAX + 1]; /* bit offsets, then code stack */
+ unsigned *xp; /* pointer into x */
+ int y; /* number of dummy codes added */
+ unsigned z; /* number of entries in current table */
+
+ /* Generate counts for each bit length */
+ el = n > 256 ? b[256] : BMAX; /* set length of EOB code, if any */
+#ifdef _KERNEL
+ memzero((char *) c, sizeof(c));
+#else
+ for (i = 0; i < BMAX+1; i++)
+ c [i] = 0;
+#endif
+ p = b;
+ i = n;
+ do {
+ c[*p]++;
+ p++; /* assume all entries <= BMAX */
+ } while (--i);
+ if (c[0] == n) { /* null input--all zero length codes */
+ *t = (struct huft *) NULL;
+ *m = 0;
+ return 0;
+ }
+ /* Find minimum and maximum length, bound *m by those */
+ for (j = 1; j <= BMAX; j++)
+ if (c[j])
+ break;
+ k = j; /* minimum code length */
+ if ((unsigned) *m < j)
+ *m = j;
+ for (i = BMAX; i; i--)
+ if (c[i])
+ break;
+ g = i; /* maximum code length */
+ if ((unsigned) *m > i)
+ *m = i;
+
+ /* Adjust last length count to fill out codes, if needed */
+ for (y = 1 << j; j < i; j++, y <<= 1)
+ if ((y -= c[j]) < 0)
+ return 2; /* bad input: more codes than bits */
+ if ((y -= c[i]) < 0)
+ return 2;
+ c[i] += y;
+
+ /* Generate starting offsets into the value table for each length */
+ x[1] = j = 0;
+ p = c + 1;
+ xp = x + 2;
+ while (--i) { /* note that i == g from above */
+ *xp++ = (j += *p++);
+ }
+
+ /* Make a table of values in order of bit lengths */
+ p = b;
+ i = 0;
+ do {
+ if ((j = *p++) != 0)
+ v[x[j]++] = i;
+ } while (++i < n);
+
+ /* Generate the Huffman codes and for each, make the table entries */
+ x[0] = i = 0; /* first Huffman code is zero */
+ p = v; /* grab values in bit order */
+ h = -1; /* no tables yet--level -1 */
+ w = l[-1] = 0; /* no bits decoded yet */
+ u[0] = (struct huft *) NULL; /* just to keep compilers happy */
+ q = (struct huft *) NULL; /* ditto */
+ z = 0; /* ditto */
+
+ /* go through the bit lengths (k already is bits in shortest code) */
+ for (; k <= g; k++) {
+ a = c[k];
+ while (a--) {
+ /*
+ * here i is the Huffman code of length k bits for
+ * value *p
+ */
+ /* make tables up to required level */
+ while (k > w + l[h]) {
+ w += l[h++]; /* add bits already decoded */
+
+ /*
+ * compute minimum size table less than or
+ * equal to *m bits
+ */
+ z = (z = g - w) > (unsigned) *m ? *m : z; /* upper limit */
+				if ((f = 1 << (j = k - w)) > a + 1) {	/* try a k-w bit table */
+								/* too few codes for k-w
+								 * bit table */
+ f -= a + 1; /* deduct codes from
+ * patterns left */
+ xp = c + k;
+ while (++j < z) { /* try smaller tables up
+ * to z bits */
+ if ((f <<= 1) <= *++xp)
+ break; /* enough codes to use
+ * up j bits */
+ f -= *xp; /* else deduct codes
+ * from patterns */
+ }
+ }
+ if ((unsigned) w + j > el && (unsigned) w < el)
+ j = el - w; /* make EOB code end at
+ * table */
+ z = 1 << j; /* table entries for j-bit
+ * table */
+ l[h] = j; /* set table size in stack */
+
+ /* allocate and link in new table */
+ if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) ==
+ (struct huft *) NULL) {
+ if (h)
+ huft_free(glbl, u[0]);
+ return 3; /* not enough memory */
+ }
+ glbl->gz_hufts += z + 1; /* track memory usage */
+ *t = q + 1; /* link to list for
+ * huft_free() */
+ *(t = &(q->v.t)) = (struct huft *) NULL;
+ u[h] = ++q; /* table starts after link */
+
+ /* connect to last table, if there is one */
+ if (h) {
+ x[h] = i; /* save pattern for
+ * backing up */
+ r.b = (uch) l[h - 1]; /* bits to dump before
+ * this table */
+ r.e = (uch) (16 + j); /* bits in this table */
+ r.v.t = q; /* pointer to this table */
+ j = (i & ((1 << w) - 1)) >> (w - l[h - 1]);
+ u[h - 1][j] = r; /* connect to last table */
+ }
+ }
+
+ /* set up table entry in r */
+ r.b = (uch) (k - w);
+ if (p >= v + n)
+ r.e = 99; /* out of values--invalid
+ * code */
+ else if (*p < s) {
+ r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block
+ * code */
+ r.v.n = *p++; /* simple code is just the
+ * value */
+ } else {
+ r.e = (uch) e[*p - s]; /* non-simple--look up
+ * in lists */
+ r.v.n = d[*p++ - s];
+ }
+
+ /* fill code-like entries with r */
+ f = 1 << (k - w);
+ for (j = i >> w; j < z; j += f)
+ q[j] = r;
+
+ /* backwards increment the k-bit code i */
+ for (j = 1 << (k - 1); i & j; j >>= 1)
+ i ^= j;
+ i ^= j;
+
+ /* backup over finished tables */
+ while ((i & ((1 << w) - 1)) != x[h])
+ w -= l[--h]; /* don't need to update q */
+ }
+ }
+
+ /* return actual size of base table */
+ *m = l[0];
+
+ /* Return true (1) if we were given an incomplete table */
+ return y != 0 && g != 1;
+}
+
+static int
+huft_free(glbl, t)
+ struct inflate *glbl;
+ struct huft *t; /* table to free */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+ list of the tables it made, with the links in a dummy first entry of
+ each table. */
+{
+ register struct huft *p, *q;
+
+ /* Go through linked list, freeing from the malloced (t[-1]) address. */
+ p = t;
+ while (p != (struct huft *) NULL) {
+ q = (--p)->v.t;
+ free(p, M_GZIP);
+ p = q;
+ }
+ return 0;
+}
+
+/* inflate (decompress) the codes in a deflated (compressed) block.
+ Return an error code or zero if it all goes ok. */
+static int
+inflate_codes(glbl, tl, td, bl, bd)
+ struct inflate *glbl;
+ struct huft *tl, *td;/* literal/length and distance decoder tables */
+ int bl, bd; /* number of bits decoded by tl[] and td[] */
+{
+ register unsigned e; /* table entry flag/number of extra bits */
+ unsigned n, d; /* length and index for copy */
+ unsigned w; /* current window position */
+ struct huft *t; /* pointer to table entry */
+ unsigned ml, md; /* masks for bl and bd bits */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* inflate the coded data */
+ ml = mask[bl]; /* precompute masks for speed */
+ md = mask[bd];
+ while (1) { /* do until end of block */
+ NEEDBITS(glbl, (unsigned) bl)
+ if ((e = (t = tl + ((unsigned) b & ml))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ if (e == 16) { /* then it's a literal */
+ glbl->gz_slide[w++] = (uch) t->v.n;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } else { /* it's an EOB or a length */
+ /* exit if end of block */
+ if (e == 15)
+ break;
+
+ /* get length of block to copy */
+ NEEDBITS(glbl, e)
+ n = t->v.n + ((unsigned) b & mask[e]);
+ DUMPBITS(e);
+
+ /* decode distance of block to copy */
+ NEEDBITS(glbl, (unsigned) bd)
+ if ((e = (t = td + ((unsigned) b & md))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ NEEDBITS(glbl, e)
+ d = w - t->v.n - ((unsigned) b & mask[e]);
+ DUMPBITS(e)
+ /* do the copy */
+ do {
+ n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e);
+#ifndef NOMEMCPY
+ if (w - d >= e) { /* (this test assumes
+ * unsigned comparison) */
+ memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e);
+ w += e;
+ d += e;
+ } else /* do it slow to avoid memcpy()
+ * overlap */
+#endif /* !NOMEMCPY */
+ do {
+ glbl->gz_slide[w++] = glbl->gz_slide[d++];
+ } while (--e);
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } while (n);
+ }
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+
+ /* done */
+ return 0;
+}
+
+/* "decompress" an inflated type 0 (stored) block. */
+static int
+inflate_stored(glbl)
+ struct inflate *glbl;
+{
+ unsigned n; /* number of bytes in block */
+ unsigned w; /* current window position */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* go to byte boundary */
+ n = k & 7;
+ DUMPBITS(n);
+
+ /* get the length and its complement */
+ NEEDBITS(glbl, 16)
+ n = ((unsigned) b & 0xffff);
+ DUMPBITS(16)
+ NEEDBITS(glbl, 16)
+ if (n != (unsigned) ((~b) & 0xffff))
+ return 1; /* error in compressed data */
+ DUMPBITS(16)
+ /* read and output the compressed data */
+ while (n--) {
+ NEEDBITS(glbl, 8)
+ glbl->gz_slide[w++] = (uch) b;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ DUMPBITS(8)
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+ return 0;
+}
+
+/* decompress an inflated type 1 (fixed Huffman codes) block. We should
+ either replace this with a custom decoder, or at least precompute the
+ Huffman tables. */
+static int
+inflate_fixed(glbl)
+ struct inflate *glbl;
+{
+ /* if first time, set up tables for fixed blocks */
+ if (glbl->gz_fixed_tl == (struct huft *) NULL) {
+ int i; /* temporary variable */
+ static unsigned l[288]; /* length list for huft_build */
+
+ /* literal table */
+ for (i = 0; i < 144; i++)
+ l[i] = 8;
+ for (; i < 256; i++)
+ l[i] = 9;
+ for (; i < 280; i++)
+ l[i] = 7;
+ for (; i < 288; i++) /* make a complete, but wrong code
+ * set */
+ l[i] = 8;
+ glbl->gz_fixed_bl = 7;
+ if ((i = huft_build(glbl, l, 288, 257, cplens, cplext,
+ &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) {
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ /* distance table */
+ for (i = 0; i < 30; i++) /* make an incomplete code
+ * set */
+ l[i] = 5;
+ glbl->gz_fixed_bd = 5;
+ if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext,
+ &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ }
+ /* decompress until an end-of-block code */
+ return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0;
+}
+
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+static int
+inflate_dynamic(glbl)
+ struct inflate *glbl;
+{
+ int i; /* temporary variables */
+ unsigned j;
+ unsigned l; /* last length */
+ unsigned m; /* mask for bit lengths table */
+ unsigned n; /* number of lengths to get */
+ struct huft *tl; /* literal/length code table */
+ struct huft *td; /* distance code table */
+ int bl; /* lookup bits for tl */
+ int bd; /* lookup bits for td */
+ unsigned nb; /* number of bit length codes */
+ unsigned nl; /* number of literal/length codes */
+ unsigned nd; /* number of distance codes */
+#ifdef PKZIP_BUG_WORKAROUND
+ unsigned ll[288 + 32]; /* literal/length and distance code
+ * lengths */
+#else
+ unsigned ll[286 + 30]; /* literal/length and distance code
+ * lengths */
+#endif
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in table lengths */
+ NEEDBITS(glbl, 5)
+ nl = 257 + ((unsigned) b & 0x1f); /* number of
+ * literal/length codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 5)
+ nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 4)
+ nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */
+ DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+ if (nl > 288 || nd > 32)
+#else
+ if (nl > 286 || nd > 30)
+#endif
+ return 1; /* bad lengths */
+ /* read in bit-length-code lengths */
+ for (j = 0; j < nb; j++) {
+ NEEDBITS(glbl, 3)
+ ll[border[j]] = (unsigned) b & 7;
+ DUMPBITS(3)
+ }
+ for (; j < 19; j++)
+ ll[border[j]] = 0;
+
+ /* build decoding table for trees--single level, 7 bit lookup */
+ bl = 7;
+ if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) {
+ if (i == 1)
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+ }
+ /* read in literal and distance code lengths */
+ n = nl + nd;
+ m = mask[bl];
+ i = l = 0;
+ while ((unsigned) i < n) {
+ NEEDBITS(glbl, (unsigned) bl)
+ j = (td = tl + ((unsigned) b & m))->b;
+ DUMPBITS(j)
+ j = td->v.n;
+ if (j < 16) /* length of code in bits (0..15) */
+ ll[i++] = l = j; /* save last length in l */
+ else if (j == 16) { /* repeat last length 3 to 6 times */
+ NEEDBITS(glbl, 2)
+ j = 3 + ((unsigned) b & 3);
+ DUMPBITS(2)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = l;
+ } else if (j == 17) { /* 3 to 10 zero length codes */
+ NEEDBITS(glbl, 3)
+ j = 3 + ((unsigned) b & 7);
+ DUMPBITS(3)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ } else { /* j == 18: 11 to 138 zero length codes */
+ NEEDBITS(glbl, 7)
+ j = 11 + ((unsigned) b & 0x7f);
+ DUMPBITS(7)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ }
+ }
+
+ /* free decoding table for trees */
+ huft_free(glbl, tl);
+
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* build the decoding tables for literal/length and distance codes */
+ bl = lbits;
+ i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete l-tree) ");
+ huft_free(glbl, tl);
+ }
+ return i; /* incomplete code set */
+ }
+ bd = dbits;
+ i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete d-tree) ");
+#ifdef PKZIP_BUG_WORKAROUND
+ i = 0;
+ }
+#else
+ huft_free(glbl, td);
+ }
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+#endif
+ }
+ /* decompress until an end-of-block code */
+ if (inflate_codes(glbl, tl, td, bl, bd))
+ return 1;
+
+ /* free the decoding tables, return */
+ huft_free(glbl, tl);
+ huft_free(glbl, td);
+ return 0;
+}
+
+/* decompress an inflated block */
+static int
+inflate_block(glbl, e)
+ struct inflate *glbl;
+ int *e; /* last block flag */
+{
+ unsigned t; /* block type */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in last block bit */
+ NEEDBITS(glbl, 1)
+ * e = (int) b & 1;
+ DUMPBITS(1)
+ /* read in block type */
+ NEEDBITS(glbl, 2)
+ t = (unsigned) b & 3;
+ DUMPBITS(2)
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* inflate that block type */
+ if (t == 2)
+ return inflate_dynamic(glbl);
+ if (t == 0)
+ return inflate_stored(glbl);
+ if (t == 1)
+ return inflate_fixed(glbl);
+ /* bad block type */
+ return 2;
+}
+
+
+
+/* decompress an inflated entry */
+static int
+xinflate(glbl)
+ struct inflate *glbl;
+{
+ int e; /* last block flag */
+ int r; /* result code */
+ unsigned h; /* maximum struct huft's malloc'ed */
+
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+
+ /* initialize window, bit buffer */
+ glbl->gz_wp = 0;
+ glbl->gz_bk = 0;
+ glbl->gz_bb = 0;
+
+ /* decompress until the last block */
+ h = 0;
+ do {
+ glbl->gz_hufts = 0;
+ if ((r = inflate_block(glbl, &e)) != 0)
+ return r;
+ if (glbl->gz_hufts > h)
+ h = glbl->gz_hufts;
+ } while (!e);
+
+ /* flush out slide */
+ FLUSH(glbl, glbl->gz_wp);
+
+ /* return success */
+ return 0;
+}
+
+/* Nobody uses this - why not? */
+int
+inflate(glbl)
+ struct inflate *glbl;
+{
+ int i;
+#ifdef _KERNEL
+ u_char *p = NULL;
+
+ if (!glbl->gz_slide)
+ p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK);
+#endif
+ if (!glbl->gz_slide)
+#ifdef _KERNEL
+ return(ENOMEM);
+#else
+ return 3; /* kzip expects 3 */
+#endif
+ i = xinflate(glbl);
+
+ if (glbl->gz_fixed_td != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_td);
+ glbl->gz_fixed_td = (struct huft *) NULL;
+ }
+ if (glbl->gz_fixed_tl != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ }
+#ifdef _KERNEL
+ if (p == glbl->gz_slide) {
+ free(glbl->gz_slide, M_GZIP);
+ glbl->gz_slide = NULL;
+ }
+#endif
+ return i;
+}
+/* ----------------------- END INFLATE.C */
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
new file mode 100644
index 0000000..40eff02
--- /dev/null
+++ b/sys/kern/init_main.c
@@ -0,0 +1,855 @@
+/*-
+ * Copyright (c) 1995 Terrence R. Lambert
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_init_path.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/exec.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/jail.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpu.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/copyright.h>
+
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
+void mi_startup(void); /* Should be elsewhere */
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+struct thread thread0 __aligned(16);
+struct vmspace vmspace0;
+struct proc *initproc;
+
+#ifndef BOOTHOWTO
+#define BOOTHOWTO 0
+#endif
+int boothowto = BOOTHOWTO; /* initialized so that it can be patched */
+SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
+ "Boot control flags, passed from loader");
+
+#ifndef BOOTVERBOSE
+#define BOOTVERBOSE 0
+#endif
+int bootverbose = BOOTVERBOSE;
+SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
+ "Control the output of verbose kernel messages");
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined.  Entries with a subsystem ID of SI_SUB_DUMMY
+ * are never executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
+
+/*
+ * The sysinit table itself.  Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit.
+ */
+SET_DECLARE(sysinit_set, struct sysinit);
+struct sysinit **sysinit, **sysinit_end;
+struct sysinit **newsysinit, **newsysinit_end;
+
+/*
+ * Merge a new sysinit set into the current set, reallocating it if
+ * necessary. This can only be called after malloc is running.
+ */
+void
+sysinit_add(struct sysinit **set, struct sysinit **set_end)
+{
+ struct sysinit **newset;
+ struct sysinit **sipp;
+ struct sysinit **xipp;
+ int count;
+
+ count = set_end - set;
+ if (newsysinit)
+ count += newsysinit_end - newsysinit;
+ else
+ count += sysinit_end - sysinit;
+ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
+ if (newset == NULL)
+ panic("cannot malloc for sysinit");
+ xipp = newset;
+ if (newsysinit)
+ for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
+ *xipp++ = *sipp;
+ else
+ for (sipp = sysinit; sipp < sysinit_end; sipp++)
+ *xipp++ = *sipp;
+ for (sipp = set; sipp < set_end; sipp++)
+ *xipp++ = *sipp;
+ if (newsysinit)
+ free(newsysinit, M_TEMP);
+ newsysinit = newset;
+ newsysinit_end = newset + count;
+}
+
+#if defined (DDB) && defined(VERBOSE_SYSINIT)
+static const char *
+symbol_name(vm_offset_t va, db_strategy_t strategy)
+{
+ const char *name;
+ c_db_sym_t sym;
+ db_expr_t offset;
+
+ if (va == 0)
+ return (NULL);
+ sym = db_search_symbol(va, strategy, &offset);
+ if (offset != 0)
+ return (NULL);
+ db_symbol_values(sym, &name, NULL);
+ return (name);
+}
+#endif
+
+/*
+ * System startup; initialize the world, create process 0, mount root
+ * filesystem, and fork to create init and pagedaemon. Most of the
+ * hard work is done in the lower-level initialization routines including
+ * startup(), which does memory initialization and autoconfiguration.
+ *
+ * This allows simple addition of new kernel subsystems that require
+ * boot time initialization.  It also allows substitution of a subsystem
+ * (for instance, a scheduler, kernel profiler, or VM system) by an object
+ * module.  Finally, it allows for optional "kernel threads".
+ */
+void
+mi_startup(void)
+{
+
+ register struct sysinit **sipp; /* system initialization*/
+ register struct sysinit **xipp; /* interior loop of sort*/
+ register struct sysinit *save; /* bubble*/
+
+#if defined(VERBOSE_SYSINIT)
+ int last;
+ int verbose;
+#endif
+
+ if (boothowto & RB_VERBOSE)
+ bootverbose++;
+
+ if (sysinit == NULL) {
+ sysinit = SET_BEGIN(sysinit_set);
+ sysinit_end = SET_LIMIT(sysinit_set);
+ }
+
+restart:
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+ for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+#if defined(VERBOSE_SYSINIT)
+ last = SI_SUB_COPYRIGHT;
+ verbose = 0;
+#if !defined(DDB)
+ printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
+#endif
+#endif
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ if ((*sipp)->subsystem == SI_SUB_DONE)
+ continue;
+
+#if defined(VERBOSE_SYSINIT)
+ if ((*sipp)->subsystem > last) {
+ verbose = 1;
+ last = (*sipp)->subsystem;
+ printf("subsystem %x\n", last);
+ }
+ if (verbose) {
+#if defined(DDB)
+ const char *func, *data;
+
+ func = symbol_name((vm_offset_t)(*sipp)->func,
+ DB_STGY_PROC);
+ data = symbol_name((vm_offset_t)(*sipp)->udata,
+ DB_STGY_ANY);
+ if (func != NULL && data != NULL)
+ printf(" %s(&%s)... ", func, data);
+ else if (func != NULL)
+ printf(" %s(%p)... ", func, (*sipp)->udata);
+ else
+#endif
+ printf(" %p(%p)... ", (*sipp)->func,
+ (*sipp)->udata);
+ }
+#endif
+
+ /* Call function */
+ (*((*sipp)->func))((*sipp)->udata);
+
+#if defined(VERBOSE_SYSINIT)
+ if (verbose)
+ printf("done.\n");
+#endif
+
+		/* Check off the one we've just done */
+ (*sipp)->subsystem = SI_SUB_DONE;
+
+ /* Check if we've installed more sysinit items via KLD */
+ if (newsysinit != NULL) {
+ if (sysinit != SET_BEGIN(sysinit_set))
+ free(sysinit, M_TEMP);
+ sysinit = newsysinit;
+ sysinit_end = newsysinit_end;
+ newsysinit = NULL;
+ newsysinit_end = NULL;
+ goto restart;
+ }
+ }
+
+ mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(&Giant);
+
+ /*
+ * Now hand over this thread to swapper.
+ */
+ swapper();
+ /* NOTREACHED*/
+}
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's belong elsewhere, but have not yet
+ **** been moved.
+ ****
+ ***************************************************************************
+ */
+static void
+print_caddr_t(void *data)
+{
+ printf("%s", (char *)data);
+}
+
+static void
+print_version(void *data __unused)
+{
+ int len;
+
+ /* Strip a trailing newline from version. */
+ len = strlen(version);
+ while (len > 0 && version[len - 1] == '\n')
+ len--;
+ printf("%.*s %s\n", len, version, machine);
+ printf("%s\n", compiler_version);
+}
+
+SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
+ copyright);
+SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
+ trademark);
+SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
+
+#ifdef WITNESS
+static char wit_warn[] =
+ "WARNING: WITNESS option enabled, expect reduced performance.\n";
+SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
+ print_caddr_t, wit_warn);
+SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1,
+ print_caddr_t, wit_warn);
+#endif
+
+#ifdef DIAGNOSTIC
+static char diag_warn[] =
+ "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
+SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
+ print_caddr_t, diag_warn);
+SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2,
+ print_caddr_t, diag_warn);
+#endif
+
+static int
+null_fetch_syscall_args(struct thread *td __unused,
+ struct syscall_args *sa __unused)
+{
+
+ panic("null_fetch_syscall_args");
+}
+
+static void
+null_set_syscall_retval(struct thread *td __unused, int error __unused)
+{
+
+ panic("null_set_syscall_retval");
+}
+
+struct sysentvec null_sysvec = {
+ .sv_size = 0,
+ .sv_table = NULL,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = NULL,
+ .sv_sendsig = NULL,
+ .sv_sigcode = NULL,
+ .sv_szsigcode = NULL,
+ .sv_prepsyscall = NULL,
+ .sv_name = "null",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = 0,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS,
+ .sv_usrstack = USRSTACK,
+ .sv_psstrings = PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = NULL,
+ .sv_setregs = NULL,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = 0,
+ .sv_set_syscall_retval = null_set_syscall_retval,
+ .sv_fetch_syscall_args = null_fetch_syscall_args,
+ .sv_syscallnames = NULL,
+ .sv_schedtail = NULL,
+};
+
+/*
+ ***************************************************************************
+ ****
+ **** The two following SYSINIT's are proc0 specific glue code. I am not
+ **** convinced that they can not be safely combined, but their order of
+ **** operation has been kept the same as in the original init_main.c
+ **** for now.
+ ****
+ **** These probably belong in init_proc.c or kern_proc.c, since they
+ **** deal with proc0 (the fork template process).
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void
+proc0_init(void *dummy __unused)
+{
+ struct proc *p;
+ struct thread *td;
+ vm_paddr_t pageablemem;
+ int i;
+
+ GIANT_REQUIRED;
+ p = &proc0;
+ td = &thread0;
+
+ /*
+ * Initialize magic number and osrel.
+ */
+ p->p_magic = P_MAGIC;
+ p->p_osrel = osreldate;
+
+ /*
+ * Initialize thread and process structures.
+ */
+ procinit(); /* set up proc zone */
+ threadinit(); /* set up UMA zones */
+
+ /*
+ * Initialise scheduler resources.
+ * Add scheduler specific parts to proc, thread as needed.
+ */
+ schedinit(); /* scheduler gets its house in order */
+
+ /*
+ * Create process 0 (the swapper).
+ */
+ LIST_INSERT_HEAD(&allproc, p, p_list);
+ LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
+ mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+ p->p_pgrp = &pgrp0;
+ LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
+ LIST_INIT(&pgrp0.pg_members);
+ LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
+
+ pgrp0.pg_session = &session0;
+ mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
+ refcount_init(&session0.s_count, 1);
+ session0.s_leader = p;
+
+ p->p_sysent = &null_sysvec;
+ p->p_flag = P_SYSTEM | P_INMEM;
+ p->p_state = PRS_NORMAL;
+ knlist_init_mtx(&p->p_klist, &p->p_mtx);
+ STAILQ_INIT(&p->p_ktr);
+ p->p_nice = NZERO;
+ /* pid_max cannot be greater than PID_MAX */
+ td->td_tid = PID_MAX + 1;
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
+ td->td_state = TDS_RUNNING;
+ td->td_pri_class = PRI_TIMESHARE;
+ td->td_user_pri = PUSER;
+ td->td_base_user_pri = PUSER;
+ td->td_lend_user_pri = PRI_MAX;
+ td->td_priority = PVM;
+ td->td_base_pri = PVM;
+ td->td_oncpu = 0;
+ td->td_flags = TDF_INMEM;
+ td->td_pflags = TDP_KTHREAD;
+ td->td_cpuset = cpuset_thread0();
+ prison0.pr_cpuset = cpuset_ref(td->td_cpuset);
+ p->p_peers = 0;
+ p->p_leader = p;
+
+
+ strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
+ strncpy(td->td_name, "swapper", sizeof (td->td_name));
+
+ callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
+ callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
+
+ /* Create credentials. */
+ p->p_ucred = crget();
+ p->p_ucred->cr_ngroups = 1; /* group 0 */
+ p->p_ucred->cr_uidinfo = uifind(0);
+ p->p_ucred->cr_ruidinfo = uifind(0);
+ p->p_ucred->cr_prison = &prison0;
+ p->p_ucred->cr_loginclass = loginclass_find("default");
+#ifdef AUDIT
+ audit_cred_kproc0(p->p_ucred);
+#endif
+#ifdef MAC
+ mac_cred_create_swapper(p->p_ucred);
+#endif
+ td->td_ucred = crhold(p->p_ucred);
+
+ /* Create sigacts. */
+ p->p_sigacts = sigacts_alloc();
+
+ /* Initialize signal state for process 0. */
+ siginit(&proc0);
+
+ /* Create the file descriptor table. */
+ p->p_fd = fdinit(NULL);
+ p->p_fdtol = NULL;
+
+ /* Create the limits structures. */
+ p->p_limit = lim_alloc();
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ p->p_limit->pl_rlimit[i].rlim_cur =
+ p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
+ p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
+ p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
+ p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
+ p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
+ p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
+ p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
+ /* Cast to avoid overflow on i386/PAE. */
+ pageablemem = ptoa((vm_paddr_t)cnt.v_free_count);
+ p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
+ p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
+ p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
+ p->p_cpulimit = RLIM_INFINITY;
+
+ /* Initialize resource accounting structures. */
+ racct_create(&p->p_racct);
+
+ p->p_stats = pstats_alloc();
+
+ /* Allocate a prototype map so we have something to fork. */
+ pmap_pinit0(vmspace_pmap(&vmspace0));
+ p->p_vmspace = &vmspace0;
+ vmspace0.vm_refcnt = 1;
+
+ /*
+ * proc0 is not expected to enter usermode, so there is no special
+ * handling for sv_minuser here, like is done for exec_new_vmspace().
+ */
+ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
+ p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
+
+ /*
+ * Call the init and ctor for the new thread and proc. We wait
+ * to do this until all other structures are fairly sane.
+ */
+ EVENTHANDLER_INVOKE(process_init, p);
+ EVENTHANDLER_INVOKE(thread_init, td);
+ EVENTHANDLER_INVOKE(process_ctor, p);
+ EVENTHANDLER_INVOKE(thread_ctor, td);
+
+ /*
+ * Charge root for one process.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
+ PROC_LOCK(p);
+ racct_add_force(p, RACCT_NPROC, 1);
+ PROC_UNLOCK(p);
+}
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
+
+/* ARGSUSED*/
+static void
+proc0_post(void *dummy __unused)
+{
+ struct timespec ts;
+ struct proc *p;
+ struct rusage ru;
+ struct thread *td;
+
+ /*
+ * Now we can look at the time, having had a chance to verify the
+ * time from the filesystem. Pretend that proc0 started now.
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ microuptime(&p->p_stats->p_start);
+ PROC_SLOCK(p);
+ rufetch(p, &ru); /* Clears thread stats */
+ PROC_SUNLOCK(p);
+ p->p_rux.rux_runtime = 0;
+ p->p_rux.rux_uticks = 0;
+ p->p_rux.rux_sticks = 0;
+ p->p_rux.rux_iticks = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ td->td_runtime = 0;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+
+ /*
+ * Give the ``random'' number generator a thump.
+ */
+ nanotime(&ts);
+ srandom(ts.tv_sec ^ ts.tv_nsec);
+}
+SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
+
+static void
+random_init(void *dummy __unused)
+{
+
+ /*
+ * After CPU has been started we have some randomness on most
+ * platforms via get_cyclecount(). For platforms that don't
+ * we will reseed random(9) in proc0_post() as well.
+ */
+ srandom(get_cyclecount());
+}
+SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL);
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's and glue code should be moved to the
+ **** respective files on a per subsystem basis.
+ ****
+ ***************************************************************************
+ */
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following code probably belongs in another file, like
+ **** kern/init_init.c.
+ ****
+ ***************************************************************************
+ */
+
+/*
+ * List of paths to try when searching for "init".
+ */
+static char init_path[MAXPATHLEN] =
+#ifdef INIT_PATH
+ __XSTRING(INIT_PATH);
+#else
+ "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init";
+#endif
+SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
+ "Path used to search the init process");
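+
+/*
+ * The compiled-in list above is only a default: start_init() below also
+ * honours an "init_path" variable from the kernel environment.  For
+ * example (the particular path is only an illustration), a
+ * /boot/loader.conf line such as
+ *
+ *	init_path="/rescue/init:/sbin/init"
+ *
+ * makes the kernel try /rescue/init first.
+ */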
+
+/*
+ * Shutdown timeout of init(8).
+ * Unused within kernel, but used to control init(8), hence do not remove.
+ */
+#ifndef INIT_SHUTDOWN_TIMEOUT
+#define INIT_SHUTDOWN_TIMEOUT 120
+#endif
+static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
+SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
+ CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
+ "Unused within kernel, but used to control init(8)");
+
+/*
+ * Start the initial user process; try exec'ing each pathname in init_path.
+ * The program is invoked with one argument containing the boot flags.
+ */
+static void
+start_init(void *dummy)
+{
+ vm_offset_t addr;
+ struct execve_args args;
+ int options, error;
+ char *var, *path, *next, *s;
+ char *ucp, **uap, *arg0, *arg1;
+ struct thread *td;
+ struct proc *p;
+
+ mtx_lock(&Giant);
+
+ GIANT_REQUIRED;
+
+ td = curthread;
+ p = td->td_proc;
+
+ vfs_mountroot();
+
+ /*
+ * Need just enough stack to hold the faked-up "execve()" arguments.
+ */
+ addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
+ if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
+ panic("init: couldn't allocate argument space");
+ p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
+ p->p_vmspace->vm_ssize = 1;
+
+ if ((var = getenv("init_path")) != NULL) {
+ strlcpy(init_path, var, sizeof(init_path));
+ freeenv(var);
+ }
+
+ for (path = init_path; *path != '\0'; path = next) {
+ while (*path == ':')
+ path++;
+ if (*path == '\0')
+ break;
+ for (next = path; *next != '\0' && *next != ':'; next++)
+ /* nothing */ ;
+ if (bootverbose)
+ printf("start_init: trying %.*s\n", (int)(next - path),
+ path);
+
+ /*
+ * Move out the boot flag argument.
+ */
+ options = 0;
+ ucp = (char *)p->p_sysent->sv_usrstack;
+ (void)subyte(--ucp, 0); /* trailing zero */
+ if (boothowto & RB_SINGLE) {
+ (void)subyte(--ucp, 's');
+ options = 1;
+ }
+#ifdef notyet
+ if (boothowto & RB_FASTBOOT) {
+ (void)subyte(--ucp, 'f');
+ options = 1;
+ }
+#endif
+
+#ifdef BOOTCDROM
+ (void)subyte(--ucp, 'C');
+ options = 1;
+#endif
+
+ if (options == 0)
+ (void)subyte(--ucp, '-');
+ (void)subyte(--ucp, '-'); /* leading hyphen */
+ arg1 = ucp;
+
+ /*
+ * Move out the file name (also arg 0).
+ */
+ (void)subyte(--ucp, 0);
+ for (s = next - 1; s >= path; s--)
+ (void)subyte(--ucp, *s);
+ arg0 = ucp;
+
+ /*
+ * Move out the arg pointers.
+ */
+ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
+ (void)suword((caddr_t)--uap, (long)0); /* terminator */
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
+
+ /*
+ * Point at the arguments.
+ */
+ args.fname = arg0;
+ args.argv = uap;
+ args.envv = NULL;
+
+ /*
+		 * Now try to exec the program.  If we can't, for any reason
+		 * other than that it doesn't exist, complain.
+ *
+ * Otherwise, return via fork_trampoline() all the way
+ * to user mode as init!
+ */
+ if ((error = sys_execve(td, &args)) == 0) {
+ mtx_unlock(&Giant);
+ return;
+ }
+ if (error != ENOENT)
+ printf("exec %.*s: error %d\n", (int)(next - path),
+ path, error);
+ }
+ printf("init: not found in path %s\n", init_path);
+ panic("no init");
+}
+
+/*
+ * Like kproc_create(), but runs in its own address space.
+ * We do this early to reserve pid 1.
+ *
+ * Note special case - do not make it runnable yet. Other work
+ * in progress will change this more.
+ */
+static void
+create_init(const void *udata __unused)
+{
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc,
+ NULL, 0);
+ if (error)
+ panic("cannot fork init: %d\n", error);
+ KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
+ /* divorce init's credentials from the kernel's */
+ newcred = crget();
+ PROC_LOCK(initproc);
+ initproc->p_flag |= P_SYSTEM | P_INMEM;
+ oldcred = initproc->p_ucred;
+ crcopy(newcred, oldcred);
+#ifdef MAC
+ mac_cred_create_init(newcred);
+#endif
+#ifdef AUDIT
+ audit_cred_proc1(newcred);
+#endif
+ initproc->p_ucred = newcred;
+ PROC_UNLOCK(initproc);
+ crfree(oldcred);
+ cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
+ cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
+}
+SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
+
+/*
+ * Make it runnable now.
+ */
+static void
+kick_init(const void *udata __unused)
+{
+ struct thread *td;
+
+ td = FIRST_THREAD_IN_PROC(initproc);
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+}
+SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL);
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
new file mode 100644
index 0000000..64b0201
--- /dev/null
+++ b/sys/kern/init_sysent.c
@@ -0,0 +1,581 @@
+/*
+ * System call switch table.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/kern/syscalls.master 255219 2013-09-05 00:09:56Z pjd
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+#define AS(name) (sizeof(struct name) / sizeof(register_t))
+
+#ifdef COMPAT_43
+#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
+#else
+#define compat(n, name) 0, (sy_call_t *)nosys
+#endif
+
+#ifdef COMPAT_FREEBSD4
+#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)
+#else
+#define compat4(n, name) 0, (sy_call_t *)nosys
+#endif
+
+#ifdef COMPAT_FREEBSD7
+#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)
+#else
+#define compat7(n, name) 0, (sy_call_t *)nosys
+#endif
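+
+/*
+ * For example, with COMPAT_43 defined the "old creat" slot below expands
+ * to { AS(ocreat_args), (sy_call_t *)ocreat, ... }; without it the slot
+ * collapses to { 0, (sy_call_t *)nosys, ... }, so the syscall number
+ * stays reserved but the call simply fails.
+ */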
+
+/* The casts are bogus but will do for now. */
+struct sysent sysent[] = {
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 0 = syscall */
+ { AS(sys_exit_args), (sy_call_t *)sys_sys_exit, AUE_EXIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 1 = exit */
+ { 0, (sy_call_t *)sys_fork, AUE_FORK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 2 = fork */
+ { AS(read_args), (sy_call_t *)sys_read, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 3 = read */
+ { AS(write_args), (sy_call_t *)sys_write, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 4 = write */
+ { AS(open_args), (sy_call_t *)sys_open, AUE_OPEN_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 5 = open */
+ { AS(close_args), (sy_call_t *)sys_close, AUE_CLOSE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 6 = close */
+ { AS(wait4_args), (sy_call_t *)sys_wait4, AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 7 = wait4 */
+ { compat(AS(ocreat_args),creat), AUE_CREAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 8 = old creat */
+ { AS(link_args), (sy_call_t *)sys_link, AUE_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 9 = link */
+ { AS(unlink_args), (sy_call_t *)sys_unlink, AUE_UNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 11 = obsolete execv */
+ { AS(chdir_args), (sy_call_t *)sys_chdir, AUE_CHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 12 = chdir */
+ { AS(fchdir_args), (sy_call_t *)sys_fchdir, AUE_FCHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 13 = fchdir */
+ { AS(mknod_args), (sy_call_t *)sys_mknod, AUE_MKNOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 14 = mknod */
+ { AS(chmod_args), (sy_call_t *)sys_chmod, AUE_CHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 15 = chmod */
+ { AS(chown_args), (sy_call_t *)sys_chown, AUE_CHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 16 = chown */
+ { AS(obreak_args), (sy_call_t *)sys_obreak, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 17 = break */
+ { compat4(AS(freebsd4_getfsstat_args),getfsstat), AUE_GETFSSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 18 = freebsd4 getfsstat */
+ { compat(AS(olseek_args),lseek), AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 19 = old lseek */
+ { 0, (sy_call_t *)sys_getpid, AUE_GETPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 20 = getpid */
+ { AS(mount_args), (sy_call_t *)sys_mount, AUE_MOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 21 = mount */
+ { AS(unmount_args), (sy_call_t *)sys_unmount, AUE_UMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 22 = unmount */
+ { AS(setuid_args), (sy_call_t *)sys_setuid, AUE_SETUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 23 = setuid */
+ { 0, (sy_call_t *)sys_getuid, AUE_GETUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 24 = getuid */
+ { 0, (sy_call_t *)sys_geteuid, AUE_GETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 25 = geteuid */
+ { AS(ptrace_args), (sy_call_t *)sys_ptrace, AUE_PTRACE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 26 = ptrace */
+ { AS(recvmsg_args), (sy_call_t *)sys_recvmsg, AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 27 = recvmsg */
+ { AS(sendmsg_args), (sy_call_t *)sys_sendmsg, AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 28 = sendmsg */
+ { AS(recvfrom_args), (sy_call_t *)sys_recvfrom, AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 29 = recvfrom */
+ { AS(accept_args), (sy_call_t *)sys_accept, AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 30 = accept */
+ { AS(getpeername_args), (sy_call_t *)sys_getpeername, AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 31 = getpeername */
+ { AS(getsockname_args), (sy_call_t *)sys_getsockname, AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 32 = getsockname */
+ { AS(access_args), (sy_call_t *)sys_access, AUE_ACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 33 = access */
+ { AS(chflags_args), (sy_call_t *)sys_chflags, AUE_CHFLAGS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 34 = chflags */
+ { AS(fchflags_args), (sy_call_t *)sys_fchflags, AUE_FCHFLAGS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sys_sync, AUE_SYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 36 = sync */
+ { AS(kill_args), (sy_call_t *)sys_kill, AUE_KILL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 37 = kill */
+ { compat(AS(ostat_args),stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 38 = old stat */
+ { 0, (sy_call_t *)sys_getppid, AUE_GETPPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 39 = getppid */
+ { compat(AS(olstat_args),lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 40 = old lstat */
+ { AS(dup_args), (sy_call_t *)sys_dup, AUE_DUP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 41 = dup */
+ { 0, (sy_call_t *)sys_pipe, AUE_PIPE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 42 = pipe */
+ { 0, (sy_call_t *)sys_getegid, AUE_GETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 43 = getegid */
+ { AS(profil_args), (sy_call_t *)sys_profil, AUE_PROFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 44 = profil */
+ { AS(ktrace_args), (sy_call_t *)sys_ktrace, AUE_KTRACE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 45 = ktrace */
+ { compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 46 = old sigaction */
+ { 0, (sy_call_t *)sys_getgid, AUE_GETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 47 = getgid */
+ { compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 48 = old sigprocmask */
+ { AS(getlogin_args), (sy_call_t *)sys_getlogin, AUE_GETLOGIN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 49 = getlogin */
+ { AS(setlogin_args), (sy_call_t *)sys_setlogin, AUE_SETLOGIN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 50 = setlogin */
+ { AS(acct_args), (sy_call_t *)sys_acct, AUE_ACCT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 51 = acct */
+ { compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 52 = old sigpending */
+ { AS(sigaltstack_args), (sy_call_t *)sys_sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 53 = sigaltstack */
+ { AS(ioctl_args), (sy_call_t *)sys_ioctl, AUE_IOCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 54 = ioctl */
+ { AS(reboot_args), (sy_call_t *)sys_reboot, AUE_REBOOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 55 = reboot */
+ { AS(revoke_args), (sy_call_t *)sys_revoke, AUE_REVOKE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 56 = revoke */
+ { AS(symlink_args), (sy_call_t *)sys_symlink, AUE_SYMLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 57 = symlink */
+ { AS(readlink_args), (sy_call_t *)sys_readlink, AUE_READLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 58 = readlink */
+ { AS(execve_args), (sy_call_t *)sys_execve, AUE_EXECVE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 59 = execve */
+ { AS(umask_args), (sy_call_t *)sys_umask, AUE_UMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 60 = umask */
+ { AS(chroot_args), (sy_call_t *)sys_chroot, AUE_CHROOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 61 = chroot */
+ { compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 62 = old fstat */
+ { compat(AS(getkerninfo_args),getkerninfo), AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 63 = old getkerninfo */
+ { compat(0,getpagesize), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 64 = old getpagesize */
+ { AS(msync_args), (sy_call_t *)sys_msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 65 = msync */
+ { 0, (sy_call_t *)sys_vfork, AUE_VFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 68 = obsolete vwrite */
+ { AS(sbrk_args), (sy_call_t *)sys_sbrk, AUE_SBRK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 69 = sbrk */
+ { AS(sstk_args), (sy_call_t *)sys_sstk, AUE_SSTK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 70 = sstk */
+ { compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 71 = old mmap */
+ { AS(ovadvise_args), (sy_call_t *)sys_ovadvise, AUE_O_VADVISE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 72 = vadvise */
+ { AS(munmap_args), (sy_call_t *)sys_munmap, AUE_MUNMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 73 = munmap */
+ { AS(mprotect_args), (sy_call_t *)sys_mprotect, AUE_MPROTECT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 74 = mprotect */
+ { AS(madvise_args), (sy_call_t *)sys_madvise, AUE_MADVISE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 77 = obsolete vlimit */
+ { AS(mincore_args), (sy_call_t *)sys_mincore, AUE_MINCORE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 78 = mincore */
+ { AS(getgroups_args), (sy_call_t *)sys_getgroups, AUE_GETGROUPS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 79 = getgroups */
+ { AS(setgroups_args), (sy_call_t *)sys_setgroups, AUE_SETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 80 = setgroups */
+ { 0, (sy_call_t *)sys_getpgrp, AUE_GETPGRP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 81 = getpgrp */
+ { AS(setpgid_args), (sy_call_t *)sys_setpgid, AUE_SETPGRP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 82 = setpgid */
+ { AS(setitimer_args), (sy_call_t *)sys_setitimer, AUE_SETITIMER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 83 = setitimer */
+ { compat(0,wait), AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 84 = old wait */
+ { AS(swapon_args), (sy_call_t *)sys_swapon, AUE_SWAPON, NULL, 0, 0, 0, SY_THR_STATIC }, /* 85 = swapon */
+ { AS(getitimer_args), (sy_call_t *)sys_getitimer, AUE_GETITIMER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 86 = getitimer */
+ { compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 87 = old gethostname */
+ { compat(AS(sethostname_args),sethostname), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 88 = old sethostname */
+ { 0, (sy_call_t *)sys_getdtablesize, AUE_GETDTABLESIZE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 89 = getdtablesize */
+ { AS(dup2_args), (sy_call_t *)sys_dup2, AUE_DUP2, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 91 = getdopt */
+ { AS(fcntl_args), (sy_call_t *)sys_fcntl, AUE_FCNTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 92 = fcntl */
+ { AS(select_args), (sy_call_t *)sys_select, AUE_SELECT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 93 = select */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 94 = setdopt */
+ { AS(fsync_args), (sy_call_t *)sys_fsync, AUE_FSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 95 = fsync */
+ { AS(setpriority_args), (sy_call_t *)sys_setpriority, AUE_SETPRIORITY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 96 = setpriority */
+ { AS(socket_args), (sy_call_t *)sys_socket, AUE_SOCKET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 97 = socket */
+ { AS(connect_args), (sy_call_t *)sys_connect, AUE_CONNECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 98 = connect */
+ { compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 99 = old accept */
+ { AS(getpriority_args), (sy_call_t *)sys_getpriority, AUE_GETPRIORITY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 100 = getpriority */
+ { compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 101 = old send */
+ { compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 102 = old recv */
+ { compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 103 = old sigreturn */
+ { AS(bind_args), (sy_call_t *)sys_bind, AUE_BIND, NULL, 0, 0, 0, SY_THR_STATIC }, /* 104 = bind */
+ { AS(setsockopt_args), (sy_call_t *)sys_setsockopt, AUE_SETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 105 = setsockopt */
+ { AS(listen_args), (sy_call_t *)sys_listen, AUE_LISTEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 107 = obsolete vtimes */
+ { compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 108 = old sigvec */
+ { compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 109 = old sigblock */
+ { compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 110 = old sigsetmask */
+ { compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 111 = old sigsuspend */
+ { compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 112 = old sigstack */
+ { compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 113 = old recvmsg */
+ { compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 115 = obsolete vtrace */
+ { AS(gettimeofday_args), (sy_call_t *)sys_gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 116 = gettimeofday */
+ { AS(getrusage_args), (sy_call_t *)sys_getrusage, AUE_GETRUSAGE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 117 = getrusage */
+ { AS(getsockopt_args), (sy_call_t *)sys_getsockopt, AUE_GETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 119 = resuba */
+ { AS(readv_args), (sy_call_t *)sys_readv, AUE_READV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 120 = readv */
+ { AS(writev_args), (sy_call_t *)sys_writev, AUE_WRITEV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 121 = writev */
+ { AS(settimeofday_args), (sy_call_t *)sys_settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 122 = settimeofday */
+ { AS(fchown_args), (sy_call_t *)sys_fchown, AUE_FCHOWN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 123 = fchown */
+ { AS(fchmod_args), (sy_call_t *)sys_fchmod, AUE_FCHMOD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 124 = fchmod */
+ { compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 125 = old recvfrom */
+ { AS(setreuid_args), (sy_call_t *)sys_setreuid, AUE_SETREUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 126 = setreuid */
+ { AS(setregid_args), (sy_call_t *)sys_setregid, AUE_SETREGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 127 = setregid */
+ { AS(rename_args), (sy_call_t *)sys_rename, AUE_RENAME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 128 = rename */
+ { compat(AS(otruncate_args),truncate), AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 129 = old truncate */
+ { compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 130 = old ftruncate */
+ { AS(flock_args), (sy_call_t *)sys_flock, AUE_FLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 131 = flock */
+ { AS(mkfifo_args), (sy_call_t *)sys_mkfifo, AUE_MKFIFO, NULL, 0, 0, 0, SY_THR_STATIC }, /* 132 = mkfifo */
+ { AS(sendto_args), (sy_call_t *)sys_sendto, AUE_SENDTO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 133 = sendto */
+ { AS(shutdown_args), (sy_call_t *)sys_shutdown, AUE_SHUTDOWN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 134 = shutdown */
+ { AS(socketpair_args), (sy_call_t *)sys_socketpair, AUE_SOCKETPAIR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 135 = socketpair */
+ { AS(mkdir_args), (sy_call_t *)sys_mkdir, AUE_MKDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 136 = mkdir */
+ { AS(rmdir_args), (sy_call_t *)sys_rmdir, AUE_RMDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 137 = rmdir */
+ { AS(utimes_args), (sy_call_t *)sys_utimes, AUE_UTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 139 = obsolete 4.2 sigreturn */
+ { AS(adjtime_args), (sy_call_t *)sys_adjtime, AUE_ADJTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 140 = adjtime */
+ { compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 141 = old getpeername */
+ { compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 142 = old gethostid */
+ { compat(AS(osethostid_args),sethostid), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 143 = old sethostid */
+ { compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 144 = old getrlimit */
+ { compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 145 = old setrlimit */
+ { compat(AS(okillpg_args),killpg), AUE_KILLPG, NULL, 0, 0, 0, SY_THR_STATIC }, /* 146 = old killpg */
+ { 0, (sy_call_t *)sys_setsid, AUE_SETSID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 147 = setsid */
+ { AS(quotactl_args), (sy_call_t *)sys_quotactl, AUE_QUOTACTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 148 = quotactl */
+ { compat(0,quota), AUE_O_QUOTA, NULL, 0, 0, 0, SY_THR_STATIC }, /* 149 = old quota */
+ { compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 153 = asyncdaemon */
+ { AS(nlm_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 154 = nlm_syscall */
+ { AS(nfssvc_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 155 = nfssvc */
+ { compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 156 = old getdirentries */
+ { compat4(AS(freebsd4_statfs_args),statfs), AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 157 = freebsd4 statfs */
+ { compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 158 = freebsd4 fstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 159 = nosys */
+ { AS(lgetfh_args), (sy_call_t *)sys_lgetfh, AUE_LGETFH, NULL, 0, 0, 0, SY_THR_STATIC }, /* 160 = lgetfh */
+ { AS(getfh_args), (sy_call_t *)sys_getfh, AUE_NFS_GETFH, NULL, 0, 0, 0, SY_THR_STATIC }, /* 161 = getfh */
+ { compat4(AS(freebsd4_getdomainname_args),getdomainname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 162 = freebsd4 getdomainname */
+ { compat4(AS(freebsd4_setdomainname_args),setdomainname), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 163 = freebsd4 setdomainname */
+ { compat4(AS(freebsd4_uname_args),uname), AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 164 = freebsd4 uname */
+ { AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 165 = sysarch */
+ { AS(rtprio_args), (sy_call_t *)sys_rtprio, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 168 = nosys */
+ { AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 169 = semsys */
+ { AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 170 = msgsys */
+ { AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 172 = nosys */
+ { AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 173 = freebsd6_pread */
+ { AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 174 = freebsd6_pwrite */
+ { AS(setfib_args), (sy_call_t *)sys_setfib, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 175 = setfib */
+ { AS(ntp_adjtime_args), (sy_call_t *)sys_ntp_adjtime, AUE_NTP_ADJTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 180 = nosys */
+ { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */
+ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */
+ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */
+ { AS(stat_args), (sy_call_t *)sys_stat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = stat */
+ { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = fstat */
+ { AS(lstat_args), (sy_call_t *)sys_lstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = lstat */
+ { AS(pathconf_args), (sy_call_t *)sys_pathconf, AUE_PATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 191 = pathconf */
+ { AS(fpathconf_args), (sy_call_t *)sys_fpathconf, AUE_FPATHCONF, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 193 = nosys */
+ { AS(__getrlimit_args), (sy_call_t *)sys_getrlimit, AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 194 = getrlimit */
+ { AS(__setrlimit_args), (sy_call_t *)sys_setrlimit, AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 195 = setrlimit */
+ { AS(getdirentries_args), (sy_call_t *)sys_getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 196 = getdirentries */
+ { AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 197 = freebsd6_mmap */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 198 = __syscall */
+ { AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 199 = freebsd6_lseek */
+ { AS(freebsd6_truncate_args), (sy_call_t *)freebsd6_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 200 = freebsd6_truncate */
+ { AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 201 = freebsd6_ftruncate */
+ { AS(sysctl_args), (sy_call_t *)sys___sysctl, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 202 = __sysctl */
+ { AS(mlock_args), (sy_call_t *)sys_mlock, AUE_MLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 203 = mlock */
+ { AS(munlock_args), (sy_call_t *)sys_munlock, AUE_MUNLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 204 = munlock */
+ { AS(undelete_args), (sy_call_t *)sys_undelete, AUE_UNDELETE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 205 = undelete */
+ { AS(futimes_args), (sy_call_t *)sys_futimes, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 206 = futimes */
+ { AS(getpgid_args), (sy_call_t *)sys_getpgid, AUE_GETPGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 207 = getpgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 208 = newreboot */
+ { AS(poll_args), (sy_call_t *)sys_poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 209 = poll */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 210 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 211 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 212 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 213 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 214 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 215 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 216 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 217 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 218 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 219 = lkmnosys */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 __semctl */
+ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */
+ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 msgctl */
+ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */
+ { AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = msgsnd */
+ { AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 227 = msgrcv */
+ { AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 228 = shmat */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 229 = freebsd7 shmctl */
+ { AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 230 = shmdt */
+ { AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 231 = shmget */
+ { AS(clock_gettime_args), (sy_call_t *)sys_clock_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 232 = clock_gettime */
+ { AS(clock_settime_args), (sy_call_t *)sys_clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 233 = clock_settime */
+ { AS(clock_getres_args), (sy_call_t *)sys_clock_getres, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 234 = clock_getres */
+ { AS(ktimer_create_args), (sy_call_t *)sys_ktimer_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 235 = ktimer_create */
+ { AS(ktimer_delete_args), (sy_call_t *)sys_ktimer_delete, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 236 = ktimer_delete */
+ { AS(ktimer_settime_args), (sy_call_t *)sys_ktimer_settime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 237 = ktimer_settime */
+ { AS(ktimer_gettime_args), (sy_call_t *)sys_ktimer_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 238 = ktimer_gettime */
+ { AS(ktimer_getoverrun_args), (sy_call_t *)sys_ktimer_getoverrun, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 239 = ktimer_getoverrun */
+ { AS(nanosleep_args), (sy_call_t *)sys_nanosleep, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 240 = nanosleep */
+ { AS(ffclock_getcounter_args), (sy_call_t *)sys_ffclock_getcounter, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 241 = ffclock_getcounter */
+ { AS(ffclock_setestimate_args), (sy_call_t *)sys_ffclock_setestimate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 242 = ffclock_setestimate */
+ { AS(ffclock_getestimate_args), (sy_call_t *)sys_ffclock_getestimate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 243 = ffclock_getestimate */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 246 = nosys */
+ { AS(clock_getcpuclockid2_args), (sy_call_t *)sys_clock_getcpuclockid2, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 247 = clock_getcpuclockid2 */
+ { AS(ntp_gettime_args), (sy_call_t *)sys_ntp_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 248 = ntp_gettime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 249 = nosys */
+ { AS(minherit_args), (sy_call_t *)sys_minherit, AUE_MINHERIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 250 = minherit */
+ { AS(rfork_args), (sy_call_t *)sys_rfork, AUE_RFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 251 = rfork */
+ { AS(openbsd_poll_args), (sy_call_t *)sys_openbsd_poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 252 = openbsd_poll */
+ { 0, (sy_call_t *)sys_issetugid, AUE_ISSETUGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 253 = issetugid */
+ { AS(lchown_args), (sy_call_t *)sys_lchown, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 254 = lchown */
+ { AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 255 = aio_read */
+ { AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 256 = aio_write */
+ { AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 257 = lio_listio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 258 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 259 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 260 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 261 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 262 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 263 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 264 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 265 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 266 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 267 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 268 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 269 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 270 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 271 = nosys */
+ { AS(getdents_args), (sy_call_t *)sys_getdents, AUE_O_GETDENTS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 272 = getdents */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 273 = nosys */
+ { AS(lchmod_args), (sy_call_t *)sys_lchmod, AUE_LCHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 274 = lchmod */
+ { AS(lchown_args), (sy_call_t *)sys_lchown, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 275 = netbsd_lchown */
+ { AS(lutimes_args), (sy_call_t *)sys_lutimes, AUE_LUTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 276 = lutimes */
+ { AS(msync_args), (sy_call_t *)sys_msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 277 = netbsd_msync */
+ { AS(nstat_args), (sy_call_t *)sys_nstat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 278 = nstat */
+ { AS(nfstat_args), (sy_call_t *)sys_nfstat, AUE_FSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 279 = nfstat */
+ { AS(nlstat_args), (sy_call_t *)sys_nlstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 280 = nlstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 281 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 282 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 283 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 284 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 285 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 286 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 287 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 288 = nosys */
+ { AS(preadv_args), (sy_call_t *)sys_preadv, AUE_PREADV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 289 = preadv */
+ { AS(pwritev_args), (sy_call_t *)sys_pwritev, AUE_PWRITEV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 290 = pwritev */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 291 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 292 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 293 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 294 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 295 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 296 = nosys */
+ { compat4(AS(freebsd4_fhstatfs_args),fhstatfs), AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 297 = freebsd4 fhstatfs */
+ { AS(fhopen_args), (sy_call_t *)sys_fhopen, AUE_FHOPEN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 298 = fhopen */
+ { AS(fhstat_args), (sy_call_t *)sys_fhstat, AUE_FHSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 299 = fhstat */
+ { AS(modnext_args), (sy_call_t *)sys_modnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 300 = modnext */
+ { AS(modstat_args), (sy_call_t *)sys_modstat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 301 = modstat */
+ { AS(modfnext_args), (sy_call_t *)sys_modfnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 302 = modfnext */
+ { AS(modfind_args), (sy_call_t *)sys_modfind, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 303 = modfind */
+ { AS(kldload_args), (sy_call_t *)sys_kldload, AUE_MODLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 304 = kldload */
+ { AS(kldunload_args), (sy_call_t *)sys_kldunload, AUE_MODUNLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 305 = kldunload */
+ { AS(kldfind_args), (sy_call_t *)sys_kldfind, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 306 = kldfind */
+ { AS(kldnext_args), (sy_call_t *)sys_kldnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 307 = kldnext */
+ { AS(kldstat_args), (sy_call_t *)sys_kldstat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 308 = kldstat */
+ { AS(kldfirstmod_args), (sy_call_t *)sys_kldfirstmod, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 309 = kldfirstmod */
+ { AS(getsid_args), (sy_call_t *)sys_getsid, AUE_GETSID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 310 = getsid */
+ { AS(setresuid_args), (sy_call_t *)sys_setresuid, AUE_SETRESUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 311 = setresuid */
+ { AS(setresgid_args), (sy_call_t *)sys_setresgid, AUE_SETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 312 = setresgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 313 = obsolete signanosleep */
+ { AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 314 = aio_return */
+ { AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 315 = aio_suspend */
+ { AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 316 = aio_cancel */
+ { AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 317 = aio_error */
+ { AS(oaio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 318 = oaio_read */
+ { AS(oaio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 319 = oaio_write */
+ { AS(olio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 320 = olio_listio */
+ { 0, (sy_call_t *)sys_yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 321 = yield */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 322 = obsolete thr_sleep */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 323 = obsolete thr_wakeup */
+ { AS(mlockall_args), (sy_call_t *)sys_mlockall, AUE_MLOCKALL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 324 = mlockall */
+ { 0, (sy_call_t *)sys_munlockall, AUE_MUNLOCKALL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 325 = munlockall */
+ { AS(__getcwd_args), (sy_call_t *)sys___getcwd, AUE_GETCWD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 326 = __getcwd */
+ { AS(sched_setparam_args), (sy_call_t *)sys_sched_setparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 327 = sched_setparam */
+ { AS(sched_getparam_args), (sy_call_t *)sys_sched_getparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 328 = sched_getparam */
+ { AS(sched_setscheduler_args), (sy_call_t *)sys_sched_setscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 329 = sched_setscheduler */
+ { AS(sched_getscheduler_args), (sy_call_t *)sys_sched_getscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 330 = sched_getscheduler */
+ { 0, (sy_call_t *)sys_sched_yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 331 = sched_yield */
+ { AS(sched_get_priority_max_args), (sy_call_t *)sys_sched_get_priority_max, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 332 = sched_get_priority_max */
+ { AS(sched_get_priority_min_args), (sy_call_t *)sys_sched_get_priority_min, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 333 = sched_get_priority_min */
+ { AS(sched_rr_get_interval_args), (sy_call_t *)sys_sched_rr_get_interval, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 334 = sched_rr_get_interval */
+ { AS(utrace_args), (sy_call_t *)sys_utrace, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 335 = utrace */
+ { compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 336 = freebsd4 sendfile */
+ { AS(kldsym_args), (sy_call_t *)sys_kldsym, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 337 = kldsym */
+ { AS(jail_args), (sy_call_t *)sys_jail, AUE_JAIL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 338 = jail */
+ { AS(nnpfs_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 339 = nnpfs_syscall */
+ { AS(sigprocmask_args), (sy_call_t *)sys_sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 340 = sigprocmask */
+ { AS(sigsuspend_args), (sy_call_t *)sys_sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 341 = sigsuspend */
+ { compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 342 = freebsd4 sigaction */
+ { AS(sigpending_args), (sy_call_t *)sys_sigpending, AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 343 = sigpending */
+ { compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 344 = freebsd4 sigreturn */
+ { AS(sigtimedwait_args), (sy_call_t *)sys_sigtimedwait, AUE_SIGWAIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 345 = sigtimedwait */
+ { AS(sigwaitinfo_args), (sy_call_t *)sys_sigwaitinfo, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 346 = sigwaitinfo */
+ { AS(__acl_get_file_args), (sy_call_t *)sys___acl_get_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 347 = __acl_get_file */
+ { AS(__acl_set_file_args), (sy_call_t *)sys___acl_set_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 348 = __acl_set_file */
+ { AS(__acl_get_fd_args), (sy_call_t *)sys___acl_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 349 = __acl_get_fd */
+ { AS(__acl_set_fd_args), (sy_call_t *)sys___acl_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 350 = __acl_set_fd */
+ { AS(__acl_delete_file_args), (sy_call_t *)sys___acl_delete_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 351 = __acl_delete_file */
+ { AS(__acl_delete_fd_args), (sy_call_t *)sys___acl_delete_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 352 = __acl_delete_fd */
+ { AS(__acl_aclcheck_file_args), (sy_call_t *)sys___acl_aclcheck_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 353 = __acl_aclcheck_file */
+ { AS(__acl_aclcheck_fd_args), (sy_call_t *)sys___acl_aclcheck_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 354 = __acl_aclcheck_fd */
+ { AS(extattrctl_args), (sy_call_t *)sys_extattrctl, AUE_EXTATTRCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 355 = extattrctl */
+ { AS(extattr_set_file_args), (sy_call_t *)sys_extattr_set_file, AUE_EXTATTR_SET_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 356 = extattr_set_file */
+ { AS(extattr_get_file_args), (sy_call_t *)sys_extattr_get_file, AUE_EXTATTR_GET_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 357 = extattr_get_file */
+ { AS(extattr_delete_file_args), (sy_call_t *)sys_extattr_delete_file, AUE_EXTATTR_DELETE_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 358 = extattr_delete_file */
+ { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 359 = aio_waitcomplete */
+ { AS(getresuid_args), (sy_call_t *)sys_getresuid, AUE_GETRESUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 360 = getresuid */
+ { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */
+ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */
+ { AS(kevent_args), (sy_call_t *)sys_kevent, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = kevent */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */
+ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */
+ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */
+ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */
+ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */
+ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */
+ { AS(afs3_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs3_syscall */
+ { AS(nmount_args), (sy_call_t *)sys_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = nmount */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */
+ { AS(__mac_get_proc_args), (sy_call_t *)sys___mac_get_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 384 = __mac_get_proc */
+ { AS(__mac_set_proc_args), (sy_call_t *)sys___mac_set_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 385 = __mac_set_proc */
+ { AS(__mac_get_fd_args), (sy_call_t *)sys___mac_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 386 = __mac_get_fd */
+ { AS(__mac_get_file_args), (sy_call_t *)sys___mac_get_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 387 = __mac_get_file */
+ { AS(__mac_set_fd_args), (sy_call_t *)sys___mac_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 388 = __mac_set_fd */
+ { AS(__mac_set_file_args), (sy_call_t *)sys___mac_set_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 389 = __mac_set_file */
+ { AS(kenv_args), (sy_call_t *)sys_kenv, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 390 = kenv */
+ { AS(lchflags_args), (sy_call_t *)sys_lchflags, AUE_LCHFLAGS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 391 = lchflags */
+ { AS(uuidgen_args), (sy_call_t *)sys_uuidgen, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 392 = uuidgen */
+ { AS(sendfile_args), (sy_call_t *)sys_sendfile, AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 393 = sendfile */
+ { AS(mac_syscall_args), (sy_call_t *)sys_mac_syscall, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 394 = mac_syscall */
+ { AS(getfsstat_args), (sy_call_t *)sys_getfsstat, AUE_GETFSSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 395 = getfsstat */
+ { AS(statfs_args), (sy_call_t *)sys_statfs, AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 396 = statfs */
+ { AS(fstatfs_args), (sy_call_t *)sys_fstatfs, AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 397 = fstatfs */
+ { AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 398 = fhstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 399 = nosys */
+ { AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 400 = ksem_close */
+ { AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 401 = ksem_post */
+ { AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 402 = ksem_wait */
+ { AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 403 = ksem_trywait */
+ { AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 404 = ksem_init */
+ { AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 405 = ksem_open */
+ { AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 406 = ksem_unlink */
+ { AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 407 = ksem_getvalue */
+ { AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 408 = ksem_destroy */
+ { AS(__mac_get_pid_args), (sy_call_t *)sys___mac_get_pid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 409 = __mac_get_pid */
+ { AS(__mac_get_link_args), (sy_call_t *)sys___mac_get_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 410 = __mac_get_link */
+ { AS(__mac_set_link_args), (sy_call_t *)sys___mac_set_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 411 = __mac_set_link */
+ { AS(extattr_set_link_args), (sy_call_t *)sys_extattr_set_link, AUE_EXTATTR_SET_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 412 = extattr_set_link */
+ { AS(extattr_get_link_args), (sy_call_t *)sys_extattr_get_link, AUE_EXTATTR_GET_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 413 = extattr_get_link */
+ { AS(extattr_delete_link_args), (sy_call_t *)sys_extattr_delete_link, AUE_EXTATTR_DELETE_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 414 = extattr_delete_link */
+ { AS(__mac_execve_args), (sy_call_t *)sys___mac_execve, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 415 = __mac_execve */
+ { AS(sigaction_args), (sy_call_t *)sys_sigaction, AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 416 = sigaction */
+ { AS(sigreturn_args), (sy_call_t *)sys_sigreturn, AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 417 = sigreturn */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 418 = __xstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 419 = __xfstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 420 = __xlstat */
+ { AS(getcontext_args), (sy_call_t *)sys_getcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 421 = getcontext */
+ { AS(setcontext_args), (sy_call_t *)sys_setcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 422 = setcontext */
+ { AS(swapcontext_args), (sy_call_t *)sys_swapcontext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 423 = swapcontext */
+ { AS(swapoff_args), (sy_call_t *)sys_swapoff, AUE_SWAPOFF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 424 = swapoff */
+ { AS(__acl_get_link_args), (sy_call_t *)sys___acl_get_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 425 = __acl_get_link */
+ { AS(__acl_set_link_args), (sy_call_t *)sys___acl_set_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 426 = __acl_set_link */
+ { AS(__acl_delete_link_args), (sy_call_t *)sys___acl_delete_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 427 = __acl_delete_link */
+ { AS(__acl_aclcheck_link_args), (sy_call_t *)sys___acl_aclcheck_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 428 = __acl_aclcheck_link */
+ { AS(sigwait_args), (sy_call_t *)sys_sigwait, AUE_SIGWAIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 429 = sigwait */
+ { AS(thr_create_args), (sy_call_t *)sys_thr_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 430 = thr_create */
+ { AS(thr_exit_args), (sy_call_t *)sys_thr_exit, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 431 = thr_exit */
+ { AS(thr_self_args), (sy_call_t *)sys_thr_self, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 432 = thr_self */
+ { AS(thr_kill_args), (sy_call_t *)sys_thr_kill, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 433 = thr_kill */
+ { AS(_umtx_lock_args), (sy_call_t *)sys__umtx_lock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 434 = _umtx_lock */
+ { AS(_umtx_unlock_args), (sy_call_t *)sys__umtx_unlock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 435 = _umtx_unlock */
+ { AS(jail_attach_args), (sy_call_t *)sys_jail_attach, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 436 = jail_attach */
+ { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */
+ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */
+ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */
+ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = ksem_timedwait */
+ { AS(thr_suspend_args), (sy_call_t *)sys_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = thr_suspend */
+ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */
+ { AS(kldunloadf_args), (sy_call_t *)sys_kldunloadf, AUE_MODUNLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 444 = kldunloadf */
+ { AS(audit_args), (sy_call_t *)sys_audit, AUE_AUDIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 445 = audit */
+ { AS(auditon_args), (sy_call_t *)sys_auditon, AUE_AUDITON, NULL, 0, 0, 0, SY_THR_STATIC }, /* 446 = auditon */
+ { AS(getauid_args), (sy_call_t *)sys_getauid, AUE_GETAUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 447 = getauid */
+ { AS(setauid_args), (sy_call_t *)sys_setauid, AUE_SETAUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 448 = setauid */
+ { AS(getaudit_args), (sy_call_t *)sys_getaudit, AUE_GETAUDIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 449 = getaudit */
+ { AS(setaudit_args), (sy_call_t *)sys_setaudit, AUE_SETAUDIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 450 = setaudit */
+ { AS(getaudit_addr_args), (sy_call_t *)sys_getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 451 = getaudit_addr */
+ { AS(setaudit_addr_args), (sy_call_t *)sys_setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 452 = setaudit_addr */
+ { AS(auditctl_args), (sy_call_t *)sys_auditctl, AUE_AUDITCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 453 = auditctl */
+ { AS(_umtx_op_args), (sy_call_t *)sys__umtx_op, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 454 = _umtx_op */
+ { AS(thr_new_args), (sy_call_t *)sys_thr_new, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 455 = thr_new */
+ { AS(sigqueue_args), (sy_call_t *)sys_sigqueue, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 456 = sigqueue */
+ { AS(kmq_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 457 = kmq_open */
+ { AS(kmq_setattr_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 458 = kmq_setattr */
+ { AS(kmq_timedreceive_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 459 = kmq_timedreceive */
+ { AS(kmq_timedsend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 460 = kmq_timedsend */
+ { AS(kmq_notify_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 461 = kmq_notify */
+ { AS(kmq_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 462 = kmq_unlink */
+ { AS(abort2_args), (sy_call_t *)sys_abort2, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 463 = abort2 */
+ { AS(thr_set_name_args), (sy_call_t *)sys_thr_set_name, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 464 = thr_set_name */
+ { AS(aio_fsync_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 465 = aio_fsync */
+ { AS(rtprio_thread_args), (sy_call_t *)sys_rtprio_thread, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 466 = rtprio_thread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 467 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 468 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 469 = __getpath_fromfd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 470 = __getpath_fromaddr */
+ { AS(sctp_peeloff_args), (sy_call_t *)sys_sctp_peeloff, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 471 = sctp_peeloff */
+ { AS(sctp_generic_sendmsg_args), (sy_call_t *)sys_sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 472 = sctp_generic_sendmsg */
+ { AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sys_sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 473 = sctp_generic_sendmsg_iov */
+ { AS(sctp_generic_recvmsg_args), (sy_call_t *)sys_sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 474 = sctp_generic_recvmsg */
+ { AS(pread_args), (sy_call_t *)sys_pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 475 = pread */
+ { AS(pwrite_args), (sy_call_t *)sys_pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 476 = pwrite */
+ { AS(mmap_args), (sy_call_t *)sys_mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 477 = mmap */
+ { AS(lseek_args), (sy_call_t *)sys_lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 478 = lseek */
+ { AS(truncate_args), (sy_call_t *)sys_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 479 = truncate */
+ { AS(ftruncate_args), (sy_call_t *)sys_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 480 = ftruncate */
+ { AS(thr_kill2_args), (sy_call_t *)sys_thr_kill2, AUE_KILL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 481 = thr_kill2 */
+ { AS(shm_open_args), (sy_call_t *)sys_shm_open, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = shm_open */
+ { AS(shm_unlink_args), (sy_call_t *)sys_shm_unlink, AUE_SHMUNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 483 = shm_unlink */
+ { AS(cpuset_args), (sy_call_t *)sys_cpuset, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 484 = cpuset */
+ { AS(cpuset_setid_args), (sy_call_t *)sys_cpuset_setid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 485 = cpuset_setid */
+ { AS(cpuset_getid_args), (sy_call_t *)sys_cpuset_getid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 486 = cpuset_getid */
+ { AS(cpuset_getaffinity_args), (sy_call_t *)sys_cpuset_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 487 = cpuset_getaffinity */
+ { AS(cpuset_setaffinity_args), (sy_call_t *)sys_cpuset_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 488 = cpuset_setaffinity */
+ { AS(faccessat_args), (sy_call_t *)sys_faccessat, AUE_FACCESSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 489 = faccessat */
+ { AS(fchmodat_args), (sy_call_t *)sys_fchmodat, AUE_FCHMODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 490 = fchmodat */
+ { AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 491 = fchownat */
+ { AS(fexecve_args), (sy_call_t *)sys_fexecve, AUE_FEXECVE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 492 = fexecve */
+ { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 493 = fstatat */
+ { AS(futimesat_args), (sy_call_t *)sys_futimesat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 494 = futimesat */
+ { AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 495 = linkat */
+ { AS(mkdirat_args), (sy_call_t *)sys_mkdirat, AUE_MKDIRAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 496 = mkdirat */
+ { AS(mkfifoat_args), (sy_call_t *)sys_mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 497 = mkfifoat */
+ { AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 498 = mknodat */
+ { AS(openat_args), (sy_call_t *)sys_openat, AUE_OPENAT_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 499 = openat */
+ { AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 500 = readlinkat */
+ { AS(renameat_args), (sy_call_t *)sys_renameat, AUE_RENAMEAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 501 = renameat */
+ { AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 502 = symlinkat */
+ { AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 503 = unlinkat */
+ { AS(posix_openpt_args), (sy_call_t *)sys_posix_openpt, AUE_POSIX_OPENPT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 504 = posix_openpt */
+ { AS(gssd_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 505 = gssd_syscall */
+ { AS(jail_get_args), (sy_call_t *)sys_jail_get, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 506 = jail_get */
+ { AS(jail_set_args), (sy_call_t *)sys_jail_set, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 507 = jail_set */
+ { AS(jail_remove_args), (sy_call_t *)sys_jail_remove, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 508 = jail_remove */
+ { AS(closefrom_args), (sy_call_t *)sys_closefrom, AUE_CLOSEFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 509 = closefrom */
+ { AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 510 = __semctl */
+ { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 511 = msgctl */
+ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 512 = shmctl */
+ { AS(lpathconf_args), (sy_call_t *)sys_lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 513 = lpathconf */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 514 = obsolete cap_new */
+ { AS(__cap_rights_get_args), (sy_call_t *)sys___cap_rights_get, AUE_CAP_RIGHTS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 515 = __cap_rights_get */
+ { 0, (sy_call_t *)sys_cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 516 = cap_enter */
+ { AS(cap_getmode_args), (sy_call_t *)sys_cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 517 = cap_getmode */
+ { AS(pdfork_args), (sy_call_t *)sys_pdfork, AUE_PDFORK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 518 = pdfork */
+ { AS(pdkill_args), (sy_call_t *)sys_pdkill, AUE_PDKILL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 519 = pdkill */
+ { AS(pdgetpid_args), (sy_call_t *)sys_pdgetpid, AUE_PDGETPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 520 = pdgetpid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 521 = pdwait4 */
+ { AS(pselect_args), (sy_call_t *)sys_pselect, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 522 = pselect */
+ { AS(getloginclass_args), (sy_call_t *)sys_getloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 523 = getloginclass */
+ { AS(setloginclass_args), (sy_call_t *)sys_setloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 524 = setloginclass */
+ { AS(rctl_get_racct_args), (sy_call_t *)sys_rctl_get_racct, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 525 = rctl_get_racct */
+ { AS(rctl_get_rules_args), (sy_call_t *)sys_rctl_get_rules, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 526 = rctl_get_rules */
+ { AS(rctl_get_limits_args), (sy_call_t *)sys_rctl_get_limits, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 527 = rctl_get_limits */
+ { AS(rctl_add_rule_args), (sy_call_t *)sys_rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */
+ { AS(rctl_remove_rule_args), (sy_call_t *)sys_rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */
+ { AS(posix_fallocate_args), (sy_call_t *)sys_posix_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 530 = posix_fallocate */
+ { AS(posix_fadvise_args), (sy_call_t *)sys_posix_fadvise, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 531 = posix_fadvise */
+ { AS(wait6_args), (sy_call_t *)sys_wait6, AUE_WAIT6, NULL, 0, 0, 0, SY_THR_STATIC }, /* 532 = wait6 */
+ { AS(cap_rights_limit_args), (sy_call_t *)sys_cap_rights_limit, AUE_CAP_RIGHTS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 533 = cap_rights_limit */
+ { AS(cap_ioctls_limit_args), (sy_call_t *)sys_cap_ioctls_limit, AUE_CAP_IOCTLS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 534 = cap_ioctls_limit */
+ { AS(cap_ioctls_get_args), (sy_call_t *)sys_cap_ioctls_get, AUE_CAP_IOCTLS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 535 = cap_ioctls_get */
+ { AS(cap_fcntls_limit_args), (sy_call_t *)sys_cap_fcntls_limit, AUE_CAP_FCNTLS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 536 = cap_fcntls_limit */
+ { AS(cap_fcntls_get_args), (sy_call_t *)sys_cap_fcntls_get, AUE_CAP_FCNTLS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 537 = cap_fcntls_get */
+ { AS(bindat_args), (sy_call_t *)sys_bindat, AUE_BINDAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 538 = bindat */
+ { AS(connectat_args), (sy_call_t *)sys_connectat, AUE_CONNECTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 539 = connectat */
+ { AS(chflagsat_args), (sy_call_t *)sys_chflagsat, AUE_CHFLAGSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 540 = chflagsat */
+ { AS(accept4_args), (sy_call_t *)sys_accept4, AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 541 = accept4 */
+ { AS(pipe2_args), (sy_call_t *)sys_pipe2, AUE_PIPE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 542 = pipe2 */
+ { AS(aio_mlock_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 543 = aio_mlock */
+};
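
For readers skimming the generated table that ends above: each row follows the kernel's sysent column order as it appears in the entries themselves — argument count, handler pointer, audit event, an optional systrace argument-conversion hook, two DTrace probe IDs, per-syscall flags (SYF_CAPENABLED marks calls permitted in capability mode), and a threading tag (SY_THR_STATIC vs. SY_THR_ABSENT). The stand-alone sketch below only mirrors that shape with simplified, made-up types and constants so the columns can be read off; it is an illustrative assumption, not the kernel's actual <sys/sysent.h> declarations or dispatch code.

/*
 * Minimal user-space sketch of the row layout seen in the table above.
 * All names here (sketch_*, SKETCH_*) are stand-ins, not kernel symbols;
 * only the column order is taken from the generated table: narg, handler,
 * audit event, systrace hook, DTrace entry/return IDs, flags, thread tag.
 */
#include <stdio.h>

typedef int (sketch_call_t)(void *td, void *uap);

struct sketch_sysent {
	int		sy_narg;		/* number of syscall arguments */
	sketch_call_t	*sy_call;		/* implementing function */
	unsigned short	sy_auevent;		/* audit event identifier */
	void		*sy_systrace_args_func;	/* optional arg-conversion hook */
	unsigned int	sy_entry;		/* DTrace entry probe ID */
	unsigned int	sy_return;		/* DTrace return probe ID */
	unsigned int	sy_flags;		/* e.g. capability-mode permitted */
	unsigned int	sy_thrcnt;		/* static / absent threading tag */
};

/* Illustrative stand-ins for the kernel's flag values (assumption). */
#define	SKETCH_CAPENABLED	0x1
#define	SKETCH_THR_STATIC	0x1
#define	SKETCH_THR_ABSENT	0x2

static int
sketch_getpid(void *td, void *uap)
{
	(void)td; (void)uap;
	return (0);		/* pretend success */
}

static int
sketch_nosys(void *td, void *uap)
{
	(void)td; (void)uap;
	return (78);		/* ENOSYS-like "no such syscall" result */
}

/* Two mock rows in the same column order as the table above. */
static struct sketch_sysent sketch_table[] = {
	{ 0, sketch_getpid, 0, NULL, 0, 0, SKETCH_CAPENABLED, SKETCH_THR_STATIC },
	{ 0, sketch_nosys,  0, NULL, 0, 0, 0,                 SKETCH_THR_ABSENT },
};

int
main(void)
{
	unsigned int i;

	/* Dispatch resembles a sysent lookup: index by slot, call the handler. */
	for (i = 0; i < sizeof(sketch_table) / sizeof(sketch_table[0]); i++)
		printf("slot %u -> narg %d, flags %#x, rv %d\n", i,
		    sketch_table[i].sy_narg, sketch_table[i].sy_flags,
		    sketch_table[i].sy_call(NULL, NULL));
	return (0);
}
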
diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c
new file mode 100644
index 0000000..ef3fd2e
--- /dev/null
+++ b/sys/kern/kern_acct.c
@@ -0,0 +1,647 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1994 Christopher G. Demetriou
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/vnode.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * The routines implemented in this file are described in:
+ * Leffler, et al.: The Design and Implementation of the 4.3BSD
+ *		UNIX Operating System (Addison-Wesley, 1989)
+ * on pages 62-63.
+ * In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
+ * comp_t representation described in the above reference was replaced
+ * with that of IEEE-754 floats.
+ *
+ * Arguably, to simplify accounting operations, this mechanism should
+ * be replaced by one in which an accounting log file (similar to /dev/klog)
+ * is read by a user process, etc. However, that has its own problems.
+ */
+
+/* Floating point definitions from <float.h>. */
+#define FLT_MANT_DIG 24 /* p */
+#define FLT_MAX_EXP 128 /* emax */
+
+/*
+ * Internal accounting functions.
+ * The former's operation is described in Leffler, et al., and the latter
+ * was provided by UCB with the 4.4BSD-Lite release
+ */
+static uint32_t encode_timeval(struct timeval);
+static uint32_t encode_long(long);
+static void acctwatch(void);
+static void acct_thread(void *);
+static int acct_disable(struct thread *, int);
+
+/*
+ * Accounting vnode pointer, saved vnode pointer, and flags for each.
+ * acct_sx protects against changes to the active vnode and credentials
+ * while accounting records are being committed to disk.
+ */
+static int acct_configured;
+static int acct_suspended;
+static struct vnode *acct_vp;
+static struct ucred *acct_cred;
+static struct plimit *acct_limit;
+static int acct_flags;
+static struct sx acct_sx;
+
+SX_SYSINIT(acct, &acct_sx, "acct_sx");
+
+/*
+ * State of the accounting kthread.
+ */
+static int acct_state;
+
+#define ACCT_RUNNING 1 /* Accounting kthread is running. */
+#define ACCT_EXITREQ 2 /* Accounting kthread should exit. */
+
+/*
+ * Values associated with enabling and disabling accounting
+ */
+static int acctsuspend = 2; /* stop accounting when < 2% free space left */
+SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
+ &acctsuspend, 0, "percentage of free disk space below which accounting stops");
+
+static int acctresume = 4; /* resume when free space risen to > 4% */
+SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
+ &acctresume, 0, "percentage of free disk space above which accounting resumes");
+
+static int acctchkfreq = 15; /* frequency (in seconds) to check space */
+
+static int
+sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS)
+{
+ int error, value;
+
+ /* Write out the old value. */
+ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int));
+ if (error || req->newptr == NULL)
+ return (error);
+
+ /* Read in and verify the new value. */
+ error = SYSCTL_IN(req, &value, sizeof(int));
+ if (error)
+ return (error);
+ if (value <= 0)
+ return (EINVAL);
+ acctchkfreq = value;
+ return (0);
+}
+SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW,
+ &acctchkfreq, 0, sysctl_acct_chkfreq, "I",
+ "frequency for checking the free space");
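+
+/*
+ * For example, "sysctl kern.acct_chkfreq=30" switches to checking free
+ * space every 30 seconds, while "sysctl kern.acct_chkfreq=0" is rejected
+ * with EINVAL by the handler above.
+ */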
+
+SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
+ "Accounting configured or not");
+
+SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
+ "Accounting suspended or not");
+
+/*
+ * Accounting system call. Written based on the specification and previous
+ * implementation done by Mark Tinguely.
+ */
+int
+sys_acct(struct thread *td, struct acct_args *uap)
+{
+ struct nameidata nd;
+ int error, flags, i, replacing;
+
+ error = priv_check(td, PRIV_ACCT);
+ if (error)
+ return (error);
+
+ /*
+ * If accounting is to be started to a file, open that file for
+	 * appending and make sure it's a 'normal' file.
+ */
+ if (uap->path != NULL) {
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ flags = FWRITE | O_APPEND;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+#ifdef MAC
+ error = mac_system_check_acct(td->td_ucred, nd.ni_vp);
+ if (error) {
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (error);
+ }
+#endif
+ VOP_UNLOCK(nd.ni_vp, 0);
+ if (nd.ni_vp->v_type != VREG) {
+ vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (EACCES);
+ }
+#ifdef MAC
+ } else {
+ error = mac_system_check_acct(td->td_ucred, NULL);
+ if (error)
+ return (error);
+#endif
+ }
+
+ /*
+ * Disallow concurrent access to the accounting vnode while we swap
+ * it out, in order to prevent access after close.
+ */
+ sx_xlock(&acct_sx);
+
+ /*
+ * Don't log spurious disable/enable messages if we are
+ * switching from one accounting file to another due to log
+ * rotation.
+ */
+ replacing = (acct_vp != NULL && uap->path != NULL);
+
+ /*
+ * If accounting was previously enabled, kill the old space-watcher,
+	 * close the file, and (if no new file was specified) leave. Reset
+ * the suspended state regardless of whether accounting remains
+ * enabled.
+ */
+ acct_suspended = 0;
+ if (acct_vp != NULL)
+ error = acct_disable(td, !replacing);
+ if (uap->path == NULL) {
+ if (acct_state & ACCT_RUNNING) {
+ acct_state |= ACCT_EXITREQ;
+ wakeup(&acct_state);
+ }
+ sx_xunlock(&acct_sx);
+ return (error);
+ }
+
+ /*
+ * Create our own plimit object without limits. It will be assigned
+ * to exiting processes.
+ */
+ acct_limit = lim_alloc();
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ acct_limit->pl_rlimit[i].rlim_cur =
+ acct_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
+
+ /*
+ * Save the new accounting file vnode, and schedule the new
+ * free space watcher.
+ */
+ acct_vp = nd.ni_vp;
+ acct_cred = crhold(td->td_ucred);
+ acct_flags = flags;
+ if (acct_state & ACCT_RUNNING)
+ acct_state &= ~ACCT_EXITREQ;
+ else {
+ /*
+ * Try to start up an accounting kthread. We may start more
+ * than one, but if so the extras will commit suicide as
+ * soon as they start up.
+ */
+ error = kproc_create(acct_thread, NULL, NULL, 0, 0,
+ "accounting");
+ if (error) {
+ (void) acct_disable(td, 0);
+ sx_xunlock(&acct_sx);
+ log(LOG_NOTICE, "Unable to start accounting thread\n");
+ return (error);
+ }
+ }
+ acct_configured = 1;
+ sx_xunlock(&acct_sx);
+ if (!replacing)
+ log(LOG_NOTICE, "Accounting enabled\n");
+ return (error);
+}
+
+/*
+ * Disable currently in-progress accounting by closing the vnode, dropping
+ * our reference to the credential, and clearing the vnode's flags.
+ */
+static int
+acct_disable(struct thread *td, int logging)
+{
+ int error;
+
+ sx_assert(&acct_sx, SX_XLOCKED);
+ error = vn_close(acct_vp, acct_flags, acct_cred, td);
+ crfree(acct_cred);
+ lim_free(acct_limit);
+ acct_configured = 0;
+ acct_vp = NULL;
+ acct_cred = NULL;
+ acct_flags = 0;
+ if (logging)
+ log(LOG_NOTICE, "Accounting disabled\n");
+ return (error);
+}
+
+/*
+ * Write out process accounting information on process exit.
+ * The data to be written out are specified in Leffler, et al.
+ * and are enumerated below. (They're also noted in the system
+ * "acct.h" header file.)
+ */
+int
+acct_process(struct thread *td)
+{
+ struct acctv2 acct;
+ struct timeval ut, st, tmp;
+ struct plimit *oldlim;
+ struct proc *p;
+ struct rusage ru;
+ int t, ret;
+
+ /*
+ * Lockless check of accounting condition before doing the hard
+ * work.
+ */
+ if (acct_vp == NULL || acct_suspended)
+ return (0);
+
+ sx_slock(&acct_sx);
+
+ /*
+ * If accounting isn't enabled, don't bother. Have to check again
+ * once we own the lock in case we raced with disabling of accounting
+ * by another thread.
+ */
+ if (acct_vp == NULL || acct_suspended) {
+ sx_sunlock(&acct_sx);
+ return (0);
+ }
+
+ p = td->td_proc;
+
+ /*
+ * Get process accounting information.
+ */
+
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+
+ /* (1) The terminal from which the process was started */
+ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
+ acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
+ else
+ acct.ac_tty = NODEV;
+ sx_sunlock(&proctree_lock);
+
+ /* (2) The name of the command that ran */
+ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
+
+ /* (3) The amount of user and system time that was used */
+ rufetchcalc(p, &ru, &ut, &st);
+ acct.ac_utime = encode_timeval(ut);
+ acct.ac_stime = encode_timeval(st);
+
+ /* (4) The elapsed time the command ran (and its starting time) */
+ tmp = boottime;
+ timevaladd(&tmp, &p->p_stats->p_start);
+ acct.ac_btime = tmp.tv_sec;
+ microuptime(&tmp);
+ timevalsub(&tmp, &p->p_stats->p_start);
+ acct.ac_etime = encode_timeval(tmp);
+
+ /* (5) The average amount of memory used */
+ tmp = ut;
+ timevaladd(&tmp, &st);
+ /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
+ t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+ if (t)
+ acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
+		    ru.ru_isrss) / t);
+ else
+ acct.ac_mem = 0;
+
+ /* (6) The number of disk I/O operations done */
+ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
+
+ /* (7) The UID and GID of the process */
+ acct.ac_uid = p->p_ucred->cr_ruid;
+ acct.ac_gid = p->p_ucred->cr_rgid;
+
+ /* (8) The boolean flags that tell how the process terminated, etc. */
+ acct.ac_flagx = p->p_acflag;
+
+ /* Setup ancillary structure fields. */
+ acct.ac_flagx |= ANVER;
+ acct.ac_zero = 0;
+ acct.ac_version = 2;
+ acct.ac_len = acct.ac_len2 = sizeof(acct);
+
+ /*
+ * Eliminate rlimits (file size limit in particular).
+ */
+ oldlim = p->p_limit;
+ p->p_limit = lim_hold(acct_limit);
+ PROC_UNLOCK(p);
+ lim_free(oldlim);
+
+ /*
+ * Write the accounting information to the file.
+ */
+ ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct),
+ (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED,
+ NULL, td);
+ sx_sunlock(&acct_sx);
+ return (ret);
+}
+
+/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
+
+/* Convert timevals and longs into IEEE-754 bit patterns. */
+
+/* Mantissa mask (MSB is implied, so subtract 1). */
+#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
+
+/*
+ * We calculate integer values to a precision of approximately
+ * 28 bits.
+ * This is high-enough precision to fill the 24 float bits
+ * and low-enough to avoid overflowing the 32 int bits.
+ */
+#define CALC_BITS 28
+
+/* log_2(1000000). */
+#define LOG2_1M 20
+
+/*
+ * Convert the elements of a timeval into a 32-bit word holding
+ * the bits of a IEEE-754 float.
+ * The float value represents the timeval's value in microsecond units.
+ */
+static uint32_t
+encode_timeval(struct timeval tv)
+{
+ int log2_s;
+ int val, exp; /* Unnormalized value and exponent */
+ int norm_exp; /* Normalized exponent */
+ int shift;
+
+ /*
+ * First calculate value and exponent to about CALC_BITS precision.
+ * Note that the following conditionals have been ordered so that
+ * the most common cases appear first.
+ */
+ if (tv.tv_sec == 0) {
+ if (tv.tv_usec == 0)
+ return (0);
+ exp = 0;
+ val = tv.tv_usec;
+ } else {
+ /*
+ * Calculate the value to a precision of approximately
+ * CALC_BITS.
+ */
+ log2_s = fls(tv.tv_sec) - 1;
+ if (log2_s + LOG2_1M < CALC_BITS) {
+ exp = 0;
+ val = 1000000 * tv.tv_sec + tv.tv_usec;
+ } else {
+ exp = log2_s + LOG2_1M - CALC_BITS;
+ val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec +
+ tv.tv_usec) >> exp);
+ }
+ }
+ /* Now normalize and pack the value into an IEEE-754 float. */
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d exp=%d shift=%d log2(val)=%d\n",
+ val, exp, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
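+
+/*
+ * Worked example: for tv = { 0, 3 } (3 microseconds), val = 3 and exp = 0,
+ * so norm_exp = 1 and shift = 22, giving ((127 + 0 + 1) << 23) | 0x400000 =
+ * 0x40400000, which is the IEEE-754 bit pattern of 3.0f.
+ */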
+
+/*
+ * Convert a non-negative long value into the bit pattern of
+ * an IEEE-754 float value.
+ */
+static uint32_t
+encode_long(long val)
+{
+ int norm_exp; /* Normalized exponent */
+ int shift;
+
+ if (val == 0)
+ return (0);
+ if (val < 0) {
+ log(LOG_NOTICE,
+ "encode_long: negative value %ld in accounting record\n",
+ val);
+ val = LONG_MAX;
+ }
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d shift=%d log2(val)=%d\n",
+ val, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
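+
+/*
+ * Worked example: encode_long(1) gives norm_exp = 0 and shift = 23, so the
+ * result is (127 << 23) | 0 = 0x3f800000, the IEEE-754 bit pattern of 1.0f.
+ */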
+
+/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
+
+/*
+ * Periodically check the filesystem to see if accounting
+ * should be turned on or off. Beware the case where the vnode
+ * has been vgone()'d out from underneath us, e.g. when the file
+ * system containing the accounting file has been forcibly unmounted.
+ */
+/* ARGSUSED */
+static void
+acctwatch(void)
+{
+ struct statfs sb;
+
+ sx_assert(&acct_sx, SX_XLOCKED);
+
+ /*
+ * If accounting was disabled before our kthread was scheduled,
+ * then acct_vp might be NULL. If so, just ask our kthread to
+ * exit and return.
+ */
+ if (acct_vp == NULL) {
+ acct_state |= ACCT_EXITREQ;
+ return;
+ }
+
+ /*
+ * If our vnode is no longer valid, tear it down and signal the
+ * accounting thread to die.
+ */
+ if (acct_vp->v_type == VBAD) {
+ (void) acct_disable(NULL, 1);
+ acct_state |= ACCT_EXITREQ;
+ return;
+ }
+
+ /*
+ * Stopping here is better than continuing, maybe it will be VBAD
+ * next time around.
+ */
+ if (VFS_STATFS(acct_vp->v_mount, &sb) < 0)
+ return;
+ if (acct_suspended) {
+ if (sb.f_bavail > (int64_t)(acctresume * sb.f_blocks /
+ 100)) {
+ acct_suspended = 0;
+ log(LOG_NOTICE, "Accounting resumed\n");
+ }
+ } else {
+ if (sb.f_bavail <= (int64_t)(acctsuspend * sb.f_blocks /
+ 100)) {
+ acct_suspended = 1;
+ log(LOG_NOTICE, "Accounting suspended\n");
+ }
+ }
+}
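+
+/*
+ * For example, with the default thresholds above (acctsuspend = 2,
+ * acctresume = 4), accounting is suspended once f_bavail drops to 2% of
+ * f_blocks or less and resumed once it rises above 4%.
+ */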
+
+/*
+ * The main loop for the dedicated kernel thread that periodically calls
+ * acctwatch().
+ */
+static void
+acct_thread(void *dummy)
+{
+ u_char pri;
+
+ /* This is a low-priority kernel thread. */
+ pri = PRI_MAX_KERN;
+ thread_lock(curthread);
+ sched_prio(curthread, pri);
+ thread_unlock(curthread);
+
+ /* If another accounting kthread is already running, just die. */
+ sx_xlock(&acct_sx);
+ if (acct_state & ACCT_RUNNING) {
+ sx_xunlock(&acct_sx);
+ kproc_exit(0);
+ }
+ acct_state |= ACCT_RUNNING;
+
+ /* Loop until we are asked to exit. */
+ while (!(acct_state & ACCT_EXITREQ)) {
+
+ /* Perform our periodic checks. */
+ acctwatch();
+
+ /*
+ * We check this flag again before sleeping since the
+ * acctwatch() might have shut down accounting and asked us
+ * to exit.
+ */
+ if (!(acct_state & ACCT_EXITREQ)) {
+ sx_sleep(&acct_state, &acct_sx, 0, "-",
+ acctchkfreq * hz);
+ }
+ }
+
+ /*
+ * Acknowledge the exit request and shutdown. We clear both the
+ * exit request and running flags.
+ */
+ acct_state = 0;
+ sx_xunlock(&acct_sx);
+ kproc_exit(0);
+}
diff --git a/sys/kern/kern_alq.c b/sys/kern/kern_alq.c
new file mode 100644
index 0000000..1e6fcf7
--- /dev/null
+++ b/sys/kern/kern_alq.c
@@ -0,0 +1,971 @@
+/*-
+ * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008-2009, Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2009-2010, The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/alq.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/fcntl.h>
+#include <sys/eventhandler.h>
+
+#include <security/mac/mac_framework.h>
+
+/* Async. Logging Queue */
+struct alq {
+ char *aq_entbuf; /* Buffer for stored entries */
+ int aq_entmax; /* Max entries */
+ int aq_entlen; /* Entry length */
+ int aq_freebytes; /* Bytes available in buffer */
+ int aq_buflen; /* Total length of our buffer */
+ int aq_writehead; /* Location for next write */
+ int aq_writetail; /* Flush starts at this location */
+ int aq_wrapearly; /* # bytes left blank at end of buf */
+ int aq_flags; /* Queue flags */
+ int aq_waiters; /* Num threads waiting for resources
+ * NB: Used as a wait channel so must
+ * not be first field in the alq struct
+ */
+ struct ale aq_getpost; /* ALE for use by get/post */
+ struct mtx aq_mtx; /* Queue lock */
+ struct vnode *aq_vp; /* Open vnode handle */
+ struct ucred *aq_cred; /* Credentials of the opening thread */
+ LIST_ENTRY(alq) aq_act; /* List of active queues */
+ LIST_ENTRY(alq) aq_link; /* List of all queues */
+};
+
+#define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */
+#define AQ_ACTIVE 0x0002 /* on the active list */
+#define AQ_FLUSHING 0x0004 /* doing IO */
+#define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */
+#define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */
+#define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */
+
+#define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx)
+#define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx)
+
+#define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen)
+
+static MALLOC_DEFINE(M_ALD, "ALD", "ALD");
+
+/*
+ * The ald_mtx protects the ald_queues list and the ald_active list.
+ */
+static struct mtx ald_mtx;
+static LIST_HEAD(, alq) ald_queues;
+static LIST_HEAD(, alq) ald_active;
+static int ald_shutingdown = 0;
+struct thread *ald_thread;
+static struct proc *ald_proc;
+static eventhandler_tag alq_eventhandler_tag = NULL;
+
+#define ALD_LOCK() mtx_lock(&ald_mtx)
+#define ALD_UNLOCK() mtx_unlock(&ald_mtx)
+
+/* Daemon functions */
+static int ald_add(struct alq *);
+static int ald_rem(struct alq *);
+static void ald_startup(void *);
+static void ald_daemon(void);
+static void ald_shutdown(void *, int);
+static void ald_activate(struct alq *);
+static void ald_deactivate(struct alq *);
+
+/* Internal queue functions */
+static void alq_shutdown(struct alq *);
+static void alq_destroy(struct alq *);
+static int alq_doio(struct alq *);
+
+
+/*
+ * Add a new queue to the global list. Fail if we're shutting down.
+ */
+static int
+ald_add(struct alq *alq)
+{
+ int error;
+
+ error = 0;
+
+ ALD_LOCK();
+ if (ald_shutingdown) {
+ error = EBUSY;
+ goto done;
+ }
+ LIST_INSERT_HEAD(&ald_queues, alq, aq_link);
+done:
+ ALD_UNLOCK();
+ return (error);
+}
+
+/*
+ * Remove a queue from the global list unless we're shutting down. If so,
+ * the ald will take care of cleaning up its resources.
+ */
+static int
+ald_rem(struct alq *alq)
+{
+ int error;
+
+ error = 0;
+
+ ALD_LOCK();
+ if (ald_shutingdown) {
+ error = EBUSY;
+ goto done;
+ }
+ LIST_REMOVE(alq, aq_link);
+done:
+ ALD_UNLOCK();
+ return (error);
+}
+
+/*
+ * Put a queue on the active list. This will schedule it for writing.
+ */
+static void
+ald_activate(struct alq *alq)
+{
+ LIST_INSERT_HEAD(&ald_active, alq, aq_act);
+ wakeup(&ald_active);
+}
+
+static void
+ald_deactivate(struct alq *alq)
+{
+ LIST_REMOVE(alq, aq_act);
+ alq->aq_flags &= ~AQ_ACTIVE;
+}
+
+static void
+ald_startup(void *unused)
+{
+ mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET);
+ LIST_INIT(&ald_queues);
+ LIST_INIT(&ald_active);
+}
+
+static void
+ald_daemon(void)
+{
+ int needwakeup;
+ struct alq *alq;
+
+ ald_thread = FIRST_THREAD_IN_PROC(ald_proc);
+
+ alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+ ald_shutdown, NULL, SHUTDOWN_PRI_FIRST);
+
+ ALD_LOCK();
+
+ for (;;) {
+ while ((alq = LIST_FIRST(&ald_active)) == NULL &&
+ !ald_shutingdown)
+ mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0);
+
+ /* Don't shutdown until all active ALQs are flushed. */
+ if (ald_shutingdown && alq == NULL) {
+ ALD_UNLOCK();
+ break;
+ }
+
+ ALQ_LOCK(alq);
+ ald_deactivate(alq);
+ ALD_UNLOCK();
+ needwakeup = alq_doio(alq);
+ ALQ_UNLOCK(alq);
+ if (needwakeup)
+ wakeup_one(alq);
+ ALD_LOCK();
+ }
+
+ kproc_exit(0);
+}
+
+static void
+ald_shutdown(void *arg, int howto)
+{
+ struct alq *alq;
+
+ ALD_LOCK();
+
+ /* Ensure no new queues can be created. */
+ ald_shutingdown = 1;
+
+ /* Shutdown all ALQs prior to terminating the ald_daemon. */
+ while ((alq = LIST_FIRST(&ald_queues)) != NULL) {
+ LIST_REMOVE(alq, aq_link);
+ ALD_UNLOCK();
+ alq_shutdown(alq);
+ ALD_LOCK();
+ }
+
+ /* At this point, all ALQs are flushed and shutdown. */
+
+ /*
+ * Wake ald_daemon so that it exits. It won't be able to do
+ * anything until we mtx_sleep because we hold the ald_mtx.
+ */
+ wakeup(&ald_active);
+
+ /* Wait for ald_daemon to exit. */
+ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0);
+
+ ALD_UNLOCK();
+}
+
+static void
+alq_shutdown(struct alq *alq)
+{
+ ALQ_LOCK(alq);
+
+ /* Stop any new writers. */
+ alq->aq_flags |= AQ_SHUTDOWN;
+
+ /*
+ * If the ALQ isn't active but has unwritten data (possible if
+ * the ALQ_NOACTIVATE flag has been used), explicitly activate the
+ * ALQ here so that the pending data gets flushed by the ald_daemon.
+ */
+ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ ALQ_UNLOCK(alq);
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ ALQ_LOCK(alq);
+ }
+
+ /* Drain IO */
+ while (alq->aq_flags & AQ_ACTIVE) {
+ alq->aq_flags |= AQ_WANTED;
+ msleep_spin(alq, &alq->aq_mtx, "aldclose", 0);
+ }
+ ALQ_UNLOCK(alq);
+
+ vn_close(alq->aq_vp, FWRITE, alq->aq_cred,
+ curthread);
+ crfree(alq->aq_cred);
+}
+
+void
+alq_destroy(struct alq *alq)
+{
+ /* Drain all pending IO. */
+ alq_shutdown(alq);
+
+ mtx_destroy(&alq->aq_mtx);
+ free(alq->aq_entbuf, M_ALD);
+ free(alq, M_ALD);
+}
+
+/*
+ * Flush all pending data to disk. This operation will block.
+ */
+static int
+alq_doio(struct alq *alq)
+{
+ struct thread *td;
+ struct mount *mp;
+ struct vnode *vp;
+ struct uio auio;
+ struct iovec aiov[2];
+ int totlen;
+ int iov;
+ int wrapearly;
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+
+ vp = alq->aq_vp;
+ td = curthread;
+ totlen = 0;
+ iov = 1;
+ wrapearly = alq->aq_wrapearly;
+
+ bzero(&aiov, sizeof(aiov));
+ bzero(&auio, sizeof(auio));
+
+ /* Start the write from the location of our buffer tail pointer. */
+ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail;
+
+ if (alq->aq_writetail < alq->aq_writehead) {
+ /* Buffer not wrapped. */
+ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail;
+ } else if (alq->aq_writehead == 0) {
+ /* Buffer not wrapped (special case to avoid an empty iov). */
+ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail -
+ wrapearly;
+ } else {
+ /*
+ * Buffer wrapped, requires 2 aiov entries:
+ * - first is from writetail to end of buffer
+ * - second is from start of buffer to writehead
+ */
+ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail -
+ wrapearly;
+ iov++;
+ aiov[1].iov_base = alq->aq_entbuf;
+ aiov[1].iov_len = alq->aq_writehead;
+ totlen = aiov[0].iov_len + aiov[1].iov_len;
+ }
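+
+	/*
+	 * For example (hypothetical values), with aq_buflen = 1024,
+	 * aq_writetail = 900, aq_writehead = 100 and no early wrap, the
+	 * first iovec covers the 124 bytes from offset 900 to the end of
+	 * the buffer and the second covers the 100 bytes from offset 0 to
+	 * the head, for a total write of 224 bytes.
+	 */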
+
+ alq->aq_flags |= AQ_FLUSHING;
+ ALQ_UNLOCK(alq);
+
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_iovcnt = iov;
+ auio.uio_resid = totlen;
+ auio.uio_td = td;
+
+ /*
+ * Do all of the junk required to write now.
+ */
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ /*
+ * XXX: VOP_WRITE error checks are ignored.
+ */
+#ifdef MAC
+ if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0)
+#endif
+ VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+
+ ALQ_LOCK(alq);
+ alq->aq_flags &= ~AQ_FLUSHING;
+
+ /* Adjust writetail as required, taking into account wrapping. */
+ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) %
+ alq->aq_buflen;
+ alq->aq_freebytes += totlen + wrapearly;
+
+ /*
+ * If we just flushed part of the buffer which wrapped, reset the
+ * wrapearly indicator.
+ */
+ if (wrapearly)
+ alq->aq_wrapearly = 0;
+
+ /*
+ * If we just flushed the buffer completely, reset indexes to 0 to
+ * minimise buffer wraps.
+ * This is also required to ensure alq_getn() can't wedge itself.
+ */
+ if (!HAS_PENDING_DATA(alq))
+ alq->aq_writehead = alq->aq_writetail = 0;
+
+ KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen),
+ ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__));
+
+ if (alq->aq_flags & AQ_WANTED) {
+ alq->aq_flags &= ~AQ_WANTED;
+ return (1);
+ }
+
+ return(0);
+}
+
+static struct kproc_desc ald_kp = {
+ "ALQ Daemon",
+ ald_daemon,
+ &ald_proc
+};
+
+SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp);
+SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL);
+
+
+/* User visible queue functions */
+
+/*
+ * Create the queue data structure, allocate the buffer, and open the file.
+ */
+
+int
+alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
+ int size, int flags)
+{
+ struct thread *td;
+ struct nameidata nd;
+ struct alq *alq;
+ int oflags;
+ int error;
+
+ KASSERT((size > 0), ("%s: size <= 0", __func__));
+
+ *alqp = NULL;
+ td = curthread;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
+ oflags = FWRITE | O_NOFOLLOW | O_CREAT;
+
+ error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL);
+ if (error)
+ return (error);
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ /* We just unlock so we hold a reference */
+ VOP_UNLOCK(nd.ni_vp, 0);
+
+ alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
+ alq->aq_vp = nd.ni_vp;
+ alq->aq_cred = crhold(cred);
+
+ mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET);
+
+ alq->aq_buflen = size;
+ alq->aq_entmax = 0;
+ alq->aq_entlen = 0;
+
+ alq->aq_freebytes = alq->aq_buflen;
+ alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO);
+ alq->aq_writehead = alq->aq_writetail = 0;
+ if (flags & ALQ_ORDERED)
+ alq->aq_flags |= AQ_ORDERED;
+
+ if ((error = ald_add(alq)) != 0) {
+ alq_destroy(alq);
+ return (error);
+ }
+
+ *alqp = alq;
+
+ return (0);
+}
+
+int
+alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
+ int size, int count)
+{
+ int ret;
+
+ KASSERT((count >= 0), ("%s: count < 0", __func__));
+
+ if (count > 0) {
+ ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0);
+ (*alqp)->aq_flags |= AQ_LEGACY;
+ (*alqp)->aq_entmax = count;
+ (*alqp)->aq_entlen = size;
+ } else
+ ret = alq_open_flags(alqp, file, cred, cmode, size, 0);
+
+ return (ret);
+}
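+
+/*
+ * Illustrative consumer sketch (names and path are hypothetical): a caller
+ * holding a suitable ucred could log fixed-length records with
+ *
+ *	struct alq *alq;
+ *	struct my_record rec;
+ *
+ *	if (alq_open(&alq, "/var/log/my.alq", cred, 0600,
+ *	    sizeof(rec), 128) == 0) {
+ *		alq_write(alq, &rec, ALQ_WAITOK);
+ *		alq_flush(alq);
+ *		alq_close(alq);
+ *	}
+ */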
+
+
+/*
+ * Copy a new entry into the queue. If the operation would block, either
+ * wait or return an error, depending on whether ALQ_WAITOK or ALQ_NOWAIT
+ * was passed in the flags.
+ */
+int
+alq_writen(struct alq *alq, void *data, int len, int flags)
+{
+ int activate, copy, ret;
+ void *waitchan;
+
+ KASSERT((len > 0 && len <= alq->aq_buflen),
+ ("%s: len <= 0 || len > aq_buflen", __func__));
+
+ activate = ret = 0;
+ copy = len;
+ waitchan = NULL;
+
+ ALQ_LOCK(alq);
+
+ /*
+ * Fail to perform the write and return EWOULDBLOCK if:
+ * - The message is larger than our underlying buffer.
+ * - The ALQ is being shutdown.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the user can't wait for space.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the alq is inactive due to prior
+ * use of the ALQ_NOACTIVATE flag (which would lead to deadlock).
+ */
+ if (len > alq->aq_buflen ||
+ alq->aq_flags & AQ_SHUTDOWN ||
+ (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) &&
+ HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) {
+ ALQ_UNLOCK(alq);
+ return (EWOULDBLOCK);
+ }
+
+ /*
+ * If we want ordered writes and there is already at least one thread
+ * waiting for resources to become available, sleep until we're woken.
+ */
+ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_waiters++;
+ msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0);
+ alq->aq_waiters--;
+ }
+
+ /*
+	 * Either ALQ_WAITOK was passed and aq_freebytes < len, in which case
+	 * we enter the while loop and sleep until enough free bytes are
+	 * available, or aq_freebytes >= len and the loop is skipped. If
+	 * AQ_ORDERED is set, only 1 thread at a time will be in this loop.
+	 * Otherwise, multiple threads may be sleeping here competing for
+	 * ALQ resources.
+ */
+ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_flags |= AQ_WANTED;
+ alq->aq_waiters++;
+ if (waitchan)
+ wakeup(waitchan);
+ msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0);
+ alq->aq_waiters--;
+
+ /*
+ * If we're the first thread to wake after an AQ_WANTED wakeup
+ * but there isn't enough free space for us, we're going to loop
+ * and sleep again. If there are other threads waiting in this
+ * loop, schedule a wakeup so that they can see if the space
+ * they require is available.
+ */
+ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) &&
+ alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED))
+ waitchan = alq;
+ else
+ waitchan = NULL;
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the above
+ * while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ /* Bail if we're shutting down. */
+ if (alq->aq_flags & AQ_SHUTDOWN) {
+ ret = EWOULDBLOCK;
+ goto unlock;
+ }
+
+ /*
+ * If we need to wrap the buffer to accommodate the write,
+ * we'll need 2 calls to bcopy.
+ */
+ if ((alq->aq_buflen - alq->aq_writehead) < len)
+ copy = alq->aq_buflen - alq->aq_writehead;
+
+ /* Copy message (or part thereof if wrap required) to the buffer. */
+ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy);
+ alq->aq_writehead += copy;
+
+ if (alq->aq_writehead >= alq->aq_buflen) {
+ KASSERT((alq->aq_writehead == alq->aq_buflen),
+ ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)",
+ __func__,
+ alq->aq_writehead,
+ alq->aq_buflen));
+ alq->aq_writehead = 0;
+ }
+
+ if (copy != len) {
+ /*
+ * Wrap the buffer by copying the remainder of our message
+ * to the start of the buffer and resetting aq_writehead.
+ */
+ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy);
+ alq->aq_writehead = len - copy;
+ }
+
+ KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen),
+ ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__));
+
+ alq->aq_freebytes -= len;
+
+ if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ activate = 1;
+ }
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+
+unlock:
+ ALQ_UNLOCK(alq);
+
+ if (activate) {
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ }
+
+ /* NB: We rely on wakeup_one waking threads in a FIFO manner. */
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+
+ return (ret);
+}
+
+int
+alq_write(struct alq *alq, void *data, int flags)
+{
+ /* Should only be called in fixed length message (legacy) mode. */
+ KASSERT((alq->aq_flags & AQ_LEGACY),
+ ("%s: fixed length write on variable length queue", __func__));
+ return (alq_writen(alq, data, alq->aq_entlen, flags));
+}
+
+/*
+ * Retrieve a pointer into the ALQ's buffer that the caller can write into
+ * directly, avoiding a bcopy.
+ */
+struct ale *
+alq_getn(struct alq *alq, int len, int flags)
+{
+ int contigbytes;
+ void *waitchan;
+
+ KASSERT((len > 0 && len <= alq->aq_buflen),
+ ("%s: len <= 0 || len > alq->aq_buflen", __func__));
+
+ waitchan = NULL;
+
+ ALQ_LOCK(alq);
+
+ /*
+ * Determine the number of free contiguous bytes.
+ * We ensure elsewhere that if aq_writehead == aq_writetail because
+ * the buffer is empty, they will both be set to 0 and therefore
+ * aq_freebytes == aq_buflen and is fully contiguous.
+ * If they are equal and the buffer is not empty, aq_freebytes will
+ * be 0 indicating the buffer is full.
+ */
+ if (alq->aq_writehead <= alq->aq_writetail)
+ contigbytes = alq->aq_freebytes;
+ else {
+ contigbytes = alq->aq_buflen - alq->aq_writehead;
+
+ if (contigbytes < len) {
+ /*
+ * Insufficient space at end of buffer to handle a
+ * contiguous write. Wrap early if there's space at
+ * the beginning. This will leave a hole at the end
+ * of the buffer which we will have to skip over when
+ * flushing the buffer to disk.
+ */
+ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) {
+ /* Keep track of # bytes left blank. */
+ alq->aq_wrapearly = contigbytes;
+ /* Do the wrap and adjust counters. */
+ contigbytes = alq->aq_freebytes =
+ alq->aq_writetail;
+ alq->aq_writehead = 0;
+ }
+ }
+ }
+
+ /*
+ * Return a NULL ALE if:
+ * - The message is larger than our underlying buffer.
+ * - The ALQ is being shutdown.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the user can't wait for space.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the alq is inactive due to prior
+ * use of the ALQ_NOACTIVATE flag (which would lead to deadlock).
+ */
+ if (len > alq->aq_buflen ||
+ alq->aq_flags & AQ_SHUTDOWN ||
+ (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) &&
+ HAS_PENDING_DATA(alq))) && contigbytes < len)) {
+ ALQ_UNLOCK(alq);
+ return (NULL);
+ }
+
+ /*
+ * If we want ordered writes and there is already at least one thread
+ * waiting for resources to become available, sleep until we're woken.
+ */
+ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_waiters++;
+ msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0);
+ alq->aq_waiters--;
+ }
+
+ /*
+	 * Either ALQ_WAITOK was passed and contigbytes < len, in which case
+	 * we enter the while loop and sleep until enough contiguous free
+	 * bytes are available, or contigbytes >= len and the loop is
+	 * skipped. If AQ_ORDERED is set, only 1 thread at a time will be in
+	 * this loop. Otherwise, multiple threads may be sleeping here
+	 * competing for ALQ resources.
+ */
+ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_flags |= AQ_WANTED;
+ alq->aq_waiters++;
+ if (waitchan)
+ wakeup(waitchan);
+ msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0);
+ alq->aq_waiters--;
+
+ if (alq->aq_writehead <= alq->aq_writetail)
+ contigbytes = alq->aq_freebytes;
+ else
+ contigbytes = alq->aq_buflen - alq->aq_writehead;
+
+ /*
+ * If we're the first thread to wake after an AQ_WANTED wakeup
+ * but there isn't enough free space for us, we're going to loop
+ * and sleep again. If there are other threads waiting in this
+ * loop, schedule a wakeup so that they can see if the space
+ * they require is available.
+ */
+ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) &&
+ contigbytes < len && !(alq->aq_flags & AQ_WANTED))
+ waitchan = alq;
+ else
+ waitchan = NULL;
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the above
+ * while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ /* Bail if we're shutting down. */
+ if (alq->aq_flags & AQ_SHUTDOWN) {
+ ALQ_UNLOCK(alq);
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+ return (NULL);
+ }
+
+ /*
+ * If we are here, we have a contiguous number of bytes >= len
+ * available in our buffer starting at aq_writehead.
+ */
+ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead;
+ alq->aq_getpost.ae_bytesused = len;
+
+ return (&alq->aq_getpost);
+}
+
+struct ale *
+alq_get(struct alq *alq, int flags)
+{
+ /* Should only be called in fixed length message (legacy) mode. */
+ KASSERT((alq->aq_flags & AQ_LEGACY),
+ ("%s: fixed length get on variable length queue", __func__));
+ return (alq_getn(alq, alq->aq_entlen, flags));
+}
+
+void
+alq_post_flags(struct alq *alq, struct ale *ale, int flags)
+{
+ int activate;
+ void *waitchan;
+
+ activate = 0;
+
+ if (ale->ae_bytesused > 0) {
+ if (!(alq->aq_flags & AQ_ACTIVE) &&
+ !(flags & ALQ_NOACTIVATE)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ activate = 1;
+ }
+
+ alq->aq_writehead += ale->ae_bytesused;
+ alq->aq_freebytes -= ale->ae_bytesused;
+
+ /* Wrap aq_writehead if we filled to the end of the buffer. */
+ if (alq->aq_writehead == alq->aq_buflen)
+ alq->aq_writehead = 0;
+
+ KASSERT((alq->aq_writehead >= 0 &&
+ alq->aq_writehead < alq->aq_buflen),
+ ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen",
+ __func__));
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the
+ * alq_getn() while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ ALQ_UNLOCK(alq);
+
+ if (activate) {
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ }
+
+ /* NB: We rely on wakeup_one waking threads in a FIFO manner. */
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+}
+
+void
+alq_flush(struct alq *alq)
+{
+ int needwakeup = 0;
+
+ ALD_LOCK();
+ ALQ_LOCK(alq);
+
+ /*
+ * Pull the lever iff there is data to flush and we're
+ * not already in the middle of a flush operation.
+ */
+ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) {
+ if (alq->aq_flags & AQ_ACTIVE)
+ ald_deactivate(alq);
+
+ ALD_UNLOCK();
+ needwakeup = alq_doio(alq);
+ } else
+ ALD_UNLOCK();
+
+ ALQ_UNLOCK(alq);
+
+ if (needwakeup)
+ wakeup_one(alq);
+}
+
+/*
+ * Flush remaining data, close the file and free all resources.
+ */
+void
+alq_close(struct alq *alq)
+{
+ /* Only flush and destroy alq if not already shutting down. */
+ if (ald_rem(alq) == 0)
+ alq_destroy(alq);
+}
+
+static int
+alq_load_handler(module_t mod, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ ALD_LOCK();
+ /* Only allow unload if there are no open queues. */
+ if (LIST_FIRST(&ald_queues) == NULL) {
+ ald_shutingdown = 1;
+ ALD_UNLOCK();
+ EVENTHANDLER_DEREGISTER(shutdown_pre_sync,
+ alq_eventhandler_tag);
+ ald_shutdown(NULL, 0);
+ mtx_destroy(&ald_mtx);
+ } else {
+ ALD_UNLOCK();
+ ret = EBUSY;
+ }
+ break;
+
+ case MOD_UNLOAD:
+ /* If MOD_QUIESCE failed we must fail here too. */
+ if (ald_shutingdown == 0)
+ ret = EBUSY;
+ break;
+
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t alq_mod =
+{
+ "alq",
+ alq_load_handler,
+ NULL
+};
+
+DECLARE_MODULE(alq, alq_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_VERSION(alq, 1);
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
new file mode 100644
index 0000000..4cfd219
--- /dev/null
+++ b/sys/kern/kern_clock.c
@@ -0,0 +1,895 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+#include "opt_device_polling.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_ntp.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/limits.h>
+#include <sys/timetc.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DEFINE( , , clock, hard);
+PMC_SOFT_DEFINE( , , clock, stat);
+PMC_SOFT_DEFINE_EX( , , clock, prof, \
+ cpu_startprofclock, cpu_stopprofclock);
+#endif
+
+#ifdef DEVICE_POLLING
+extern void hardclock_device_poll(void);
+#endif /* DEVICE_POLLING */
+
+static void initclocks(void *dummy);
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
+
+/* Spin-lock protecting profiling statistics. */
+static struct mtx time_lock;
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *");
+
+static int
+sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long cp_time[CPUSTATES];
+#ifdef SCTL_MASK32
+ int i;
+ unsigned int cp_time32[CPUSTATES];
+#endif
+
+ read_cpu_time(cp_time);
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time32));
+ for (i = 0; i < CPUSTATES; i++)
+ cp_time32[i] = (unsigned int)cp_time[i];
+ error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
+ } else
+#endif
+ {
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time));
+ error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
+ }
+ return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
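+
+/*
+ * For example, "sysctl kern.cp_time" reports the CPUSTATES counters summed
+ * across all CPUs by read_cpu_time() below.
+ */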
+
+static long empty[CPUSTATES];
+
+static int
+sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
+{
+ struct pcpu *pcpu;
+ int error;
+ int c;
+ long *cp_time;
+#ifdef SCTL_MASK32
+ unsigned int cp_time32[CPUSTATES];
+ int i;
+#endif
+
+ if (!req->oldptr) {
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
+ else
+#endif
+ return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
+ }
+ for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
+ if (!CPU_ABSENT(c)) {
+ pcpu = pcpu_find(c);
+ cp_time = pcpu->pc_cp_time;
+ } else {
+ cp_time = empty;
+ }
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ for (i = 0; i < CPUSTATES; i++)
+ cp_time32[i] = (unsigned int)cp_time[i];
+ error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
+ } else
+#endif
+ error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
+ }
+ return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
+
+#ifdef DEADLKRES
+static const char *blessed[] = {
+ "getblk",
+ "so_snd_sx",
+ "so_rcv_sx",
+ NULL
+};
+static int slptime_threshold = 1800;
+static int blktime_threshold = 900;
+static int sleepfreq = 3;
+
+static void
+deadlkres(void)
+{
+ struct proc *p;
+ struct thread *td;
+ void *wchan;
+ int blkticks, i, slpticks, slptype, tryl, tticks;
+
+ tryl = 0;
+ for (;;) {
+ blkticks = blktime_threshold * hz;
+ slpticks = slptime_threshold * hz;
+
+ /*
+		 * Avoid sleeping on the sx_lock in order to avoid a possible
+		 * priority inversion problem leading to starvation.
+		 * If the lock can't be acquired after 100 tries, panic.
+ */
+ if (!sx_try_slock(&allproc_lock)) {
+ if (tryl > 100)
+ panic("%s: possible deadlock detected on allproc_lock\n",
+ __func__);
+ tryl++;
+ pause("allproc", sleepfreq * hz);
+ continue;
+ }
+ tryl = 0;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+
+ thread_lock(td);
+ if (TD_ON_LOCK(td)) {
+
+ /*
+ * The thread should be blocked on a
+ * turnstile, simply check if the
+ * turnstile channel is in good state.
+ */
+ MPASS(td->td_blocked != NULL);
+
+ tticks = ticks - td->td_blktick;
+ thread_unlock(td);
+ if (tticks > blkticks) {
+
+ /*
+					 * According to the provided
+					 * thresholds, this thread has
+					 * been stuck on a turnstile
+					 * for too long.
+ */
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
+ panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
+ __func__, td, tticks);
+ }
+ } else if (TD_IS_SLEEPING(td) &&
+ TD_ON_SLEEPQ(td)) {
+
+ /*
+ * Check if the thread is sleeping on a
+ * lock, otherwise skip the check.
+ * Drop the thread lock in order to
+ * avoid a LOR with the sleepqueue
+ * spinlock.
+ */
+ wchan = td->td_wchan;
+ tticks = ticks - td->td_slptick;
+ thread_unlock(td);
+ slptype = sleepq_type(wchan);
+ if ((slptype == SLEEPQ_SX ||
+ slptype == SLEEPQ_LK) &&
+ tticks > slpticks) {
+
+ /*
+					 * According to the provided
+					 * thresholds, this thread has
+					 * been stuck on a sleepqueue
+					 * for too long. However, being
+					 * on a sleepqueue, we should
+					 * still check the blessed
+					 * list before panicking.
+ */
+ tryl = 0;
+ for (i = 0; blessed[i] != NULL;
+ i++) {
+ if (!strcmp(blessed[i],
+ td->td_wmesg)) {
+ tryl = 1;
+ break;
+ }
+ }
+ if (tryl != 0) {
+ tryl = 0;
+ continue;
+ }
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
+ panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
+ __func__, td, tticks);
+ }
+ } else
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+
+ /* Sleep for sleepfreq seconds. */
+ pause("-", sleepfreq * hz);
+ }
+}
+
+static struct kthread_desc deadlkres_kd = {
+ "deadlkres",
+ deadlkres,
+ (struct thread **)NULL
+};
+
+SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
+
+static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0,
+ "Deadlock resolver");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
+ &slptime_threshold, 0,
+    "Number of seconds a thread may sleep on a sleepqueue before being considered stuck");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
+ &blktime_threshold, 0,
+    "Number of seconds a thread may block on a turnstile before being considered stuck");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
+    "Number of seconds between deadlock resolver thread runs");
+#endif /* DEADLKRES */
+
+void
+read_cpu_time(long *cp_time)
+{
+ struct pcpu *pc;
+ int i, j;
+
+ /* Sum up global cp_time[]. */
+ bzero(cp_time, sizeof(long) * CPUSTATES);
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ for (j = 0; j < CPUSTATES; j++)
+ cp_time[j] += pc->pc_cp_time[j];
+ }
+}
+
+#ifdef SW_WATCHDOG
+#include <sys/watchdog.h>
+
+static int watchdog_ticks;
+static int watchdog_enabled;
+static void watchdog_fire(void);
+static void watchdog_config(void *, u_int, int *);
+#endif /* SW_WATCHDOG */
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other.
+ *
+ * The main timer, running hz times per second, is used to trigger interval
+ * timers, timeouts and rescheduling as needed.
+ *
+ * The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ *
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
+ */
+
+int stathz;
+int profhz;
+int profprocs;
+volatile int ticks;
+int psratio;
+
+static DPCPU_DEFINE(int, pcputicks); /* Per-CPU version of ticks. */
+static int global_hardclock_run = 0;
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+#ifdef SW_WATCHDOG
+ EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
+#endif
+}
+
+/*
+ * Each time the real-time timer fires, this function is called on all CPUs.
+ * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
+ * the other CPUs in the system need to call this function.
+ */
+void
+hardclock_cpu(int usermode)
+{
+ struct pstats *pstats;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int flags;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ flags = 0;
+ if (usermode &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ flags |= TDF_PROFPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ thread_lock(td);
+ sched_tick(1);
+ td->td_flags |= flags;
+ thread_unlock(td);
+
+#ifdef HWPMC_HOOKS
+ if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
+ PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
+#endif
+ callout_process(sbinuptime());
+}
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(int usermode, uintfptr_t pc)
+{
+
+ atomic_add_int(&ticks, 1);
+ hardclock_cpu(usermode);
+ tc_ticktock(1);
+ cpu_tick_calibration();
+ /*
+ * If no separate statistics clock is available, run it from here.
+ *
+ * XXX: this only works for UP
+ */
+ if (stathz == 0) {
+ profclock(usermode, pc);
+ statclock(usermode);
+ }
+#ifdef DEVICE_POLLING
+ hardclock_device_poll(); /* this is very short and quick */
+#endif /* DEVICE_POLLING */
+#ifdef SW_WATCHDOG
+ if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
+ watchdog_fire();
+#endif /* SW_WATCHDOG */
+}
+
+void
+hardclock_cnt(int cnt, int usermode)
+{
+ struct pstats *pstats;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int *t = DPCPU_PTR(pcputicks);
+ int flags, global, newticks;
+#ifdef SW_WATCHDOG
+ int i;
+#endif /* SW_WATCHDOG */
+
+ /*
+ * Update per-CPU and possibly global ticks values.
+ */
+ *t += cnt;
+ do {
+ global = ticks;
+ newticks = *t - global;
+ if (newticks <= 0) {
+ if (newticks < -1)
+ *t = global - 1;
+ newticks = 0;
+ break;
+ }
+ } while (!atomic_cmpset_int(&ticks, global, *t));
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ flags = 0;
+ if (usermode &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
+ tick * cnt) == 0)
+ flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
+ tick * cnt) == 0)
+ flags |= TDF_PROFPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ thread_lock(td);
+ sched_tick(cnt);
+ td->td_flags |= flags;
+ thread_unlock(td);
+
+#ifdef HWPMC_HOOKS
+ if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
+ PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
+#endif
+ /* We are in charge of handling this tick's duties. */
+ if (newticks > 0) {
+ /* It is dangerous and unnecessary to call these concurrently. */
+ if (atomic_cmpset_acq_int(&global_hardclock_run, 0, 1)) {
+ tc_ticktock(newticks);
+#ifdef DEVICE_POLLING
+ /* This is very short and quick. */
+ hardclock_device_poll();
+#endif /* DEVICE_POLLING */
+ atomic_store_rel_int(&global_hardclock_run, 0);
+ }
+#ifdef SW_WATCHDOG
+ if (watchdog_enabled > 0) {
+ i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
+ if (i > 0 && i <= newticks)
+ watchdog_fire();
+ }
+#endif /* SW_WATCHDOG */
+ }
+ if (curcpu == CPU_FIRST())
+ cpu_tick_calibration();
+}
+
+void
+hardclock_sync(int cpu)
+{
+ int *t = DPCPU_ID_PTR(cpu, pcputicks);
+
+ *t = ticks;
+}
+
+/*
+ * Compute number of ticks in the specified amount of time.
+ */
+int
+tvtohz(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ sec = tv->tv_sec;
+ usec = tv->tv_usec;
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ if (usec > 0) {
+ sec++;
+ usec -= 1000000;
+ }
+ printf("tvotohz: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return ((int)ticks);
+}
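+
+/*
+ * Worked example (illustrative only, assuming hz = 1000 so tick = 1000 us):
+ * for tv = { 0, 25000 } the LONG_MAX / 1000000 branch above computes
+ * (0 * 1000000 + 25000 + 999) / 1000 + 1 = 25 + 1 = 26 ticks, i.e. the
+ * requested 25 ms rounded up, plus one tick to allow for the partially
+ * elapsed current tick.
+ */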
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (p->p_flag & P_STOPPROF)
+ return;
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ mtx_lock(&time_lock);
+ if (++profprocs == 1)
+ cpu_startprofclock();
+ mtx_unlock(&time_lock);
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (p->p_flag & P_PROFIL) {
+ if (p->p_profthreads != 0) {
+ p->p_flag |= P_STOPPROF;
+ while (p->p_profthreads != 0)
+ msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
+ "stopprof", 0);
+ p->p_flag &= ~P_STOPPROF;
+ }
+ if ((p->p_flag & P_PROFIL) == 0)
+ return;
+ p->p_flag &= ~P_PROFIL;
+ mtx_lock(&time_lock);
+ if (--profprocs == 0)
+ cpu_stopprofclock();
+ mtx_unlock(&time_lock);
+ }
+}
+
+/*
+ * Statistics clock. Updates rusage information and calls the scheduler
+ * to adjust priorities of the active thread.
+ *
+ * This should be called by all active processors.
+ */
+void
+statclock(int usermode)
+{
+
+ statclock_cnt(1, usermode);
+}
+
+void
+statclock_cnt(int cnt, int usermode)
+{
+ struct rusage *ru;
+ struct vmspace *vm;
+ struct thread *td;
+ struct proc *p;
+ long rss;
+ long *cp_time;
+
+ td = curthread;
+ p = td->td_proc;
+
+ cp_time = (long *)PCPU_PTR(cp_time);
+ if (usermode) {
+ /*
+ * Charge the time as appropriate.
+ */
+ td->td_uticks += cnt;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE] += cnt;
+ else
+ cp_time[CP_USER] += cnt;
+ } else {
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ if ((td->td_pflags & TDP_ITHREAD) ||
+ td->td_intr_nesting_level >= 2) {
+ td->td_iticks += cnt;
+ cp_time[CP_INTR] += cnt;
+ } else {
+ td->td_pticks += cnt;
+ td->td_sticks += cnt;
+ if (!TD_IS_IDLETHREAD(td))
+ cp_time[CP_SYS] += cnt;
+ else
+ cp_time[CP_IDLE] += cnt;
+ }
+ }
+
+ /* Update resource usage integrals and maximums. */
+ MPASS(p->p_vmspace != NULL);
+ vm = p->p_vmspace;
+ ru = &td->td_ru;
+ ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
+ ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
+ ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
+ rss = pgtok(vmspace_resident_count(vm));
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
+ "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
+ SDT_PROBE2(sched, , , tick, td, td->td_proc);
+ thread_lock_flags(td, MTX_QUIET);
+ for ( ; cnt > 0; cnt--)
+ sched_clock(td);
+ thread_unlock(td);
+#ifdef HWPMC_HOOKS
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
+#endif
+}
+
+void
+profclock(int usermode, uintfptr_t pc)
+{
+
+ profclock_cnt(1, usermode, pc);
+}
+
+void
+profclock_cnt(int cnt, int usermode, uintfptr_t pc)
+{
+ struct thread *td;
+#ifdef GPROF
+ struct gmonparam *g;
+ uintfptr_t i;
+#endif
+
+ td = curthread;
+ if (usermode) {
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled, record the tick.
+ * If there is no related user location yet, don't
+ * bother trying to count it.
+ */
+ if (td->td_proc->p_flag & P_PROFIL)
+ addupc_intr(td, pc, cnt);
+ }
+#ifdef GPROF
+ else {
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
+ i = PC_TO_I(g, pc);
+ if (i < g->textsize) {
+ KCOUNT(g, i) += cnt;
+ }
+ }
+ }
+#endif
+#ifdef HWPMC_HOOKS
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
+#endif
+}
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ bzero(&clkinfo, sizeof(clkinfo));
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
+ CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo",
+ "Rate and period of various kernel clocks");
+
+#ifdef SW_WATCHDOG
+
+static void
+watchdog_config(void *unused __unused, u_int cmd, int *error)
+{
+ u_int u;
+
+ u = cmd & WD_INTERVAL;
+ if (u >= WD_TO_1SEC) {
+ watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
+ watchdog_enabled = 1;
+ *error = 0;
+ } else {
+ watchdog_enabled = 0;
+ }
+}
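+
+/*
+ * For example (illustrative only): a command with (cmd & WD_INTERVAL) equal
+ * to WD_TO_1SEC + 4 requests a 2^4 = 16 second timeout, so watchdog_ticks is
+ * set to 16 * hz and watchdog_fire() runs if hardclock() counts it down to
+ * zero before the interval is reprogrammed.
+ */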
+
+/*
+ * Handle a watchdog timeout by dumping interrupt information and
+ * then either dropping to DDB or panicking.
+ */
+static void
+watchdog_fire(void)
+{
+ int nintr;
+ uint64_t inttotal;
+ u_long *curintr;
+ char *curname;
+
+ curintr = intrcnt;
+ curname = intrnames;
+ inttotal = 0;
+ nintr = sintrcnt / sizeof(u_long);
+
+ printf("interrupt total\n");
+ while (--nintr >= 0) {
+ if (*curintr)
+ printf("%-12s %20lu\n", curname, *curintr);
+ curname += strlen(curname) + 1;
+ inttotal += *curintr++;
+ }
+ printf("Total %20ju\n", (uintmax_t)inttotal);
+
+#if defined(KDB) && !defined(KDB_UNATTENDED)
+ kdb_backtrace();
+ kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
+#else
+ panic("watchdog timeout");
+#endif
+}
+
+#endif /* SW_WATCHDOG */
diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c
new file mode 100644
index 0000000..c2bebbe
--- /dev/null
+++ b/sys/kern/kern_clocksource.c
@@ -0,0 +1,949 @@
+/*-
+ * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Common routines to manage event timers hardware.
+ */
+
+#include "opt_device_polling.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/timeet.h>
+#include <sys/timetc.h>
+
+#include <machine/atomic.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+cyclic_clock_func_t cyclic_clock_func = NULL;
+#endif
+
+int cpu_can_deep_sleep = 0; /* C3 state is available. */
+int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */
+
+static void setuptimer(void);
+static void loadtimer(sbintime_t now, int first);
+static int doconfigtimer(void);
+static void configtimer(int start);
+static int round_freq(struct eventtimer *et, int freq);
+
+static sbintime_t getnextcpuevent(int idle);
+static sbintime_t getnextevent(void);
+static int handleevents(sbintime_t now, int fake);
+
+static struct mtx et_hw_mtx;
+
+#define ET_HW_LOCK(state) \
+ { \
+ if (timer->et_flags & ET_FLAGS_PERCPU) \
+ mtx_lock_spin(&(state)->et_hw_mtx); \
+ else \
+ mtx_lock_spin(&et_hw_mtx); \
+ }
+
+#define ET_HW_UNLOCK(state) \
+ { \
+ if (timer->et_flags & ET_FLAGS_PERCPU) \
+ mtx_unlock_spin(&(state)->et_hw_mtx); \
+ else \
+ mtx_unlock_spin(&et_hw_mtx); \
+ }
+
+static struct eventtimer *timer = NULL;
+static sbintime_t timerperiod; /* Timer period for periodic mode. */
+static sbintime_t statperiod; /* statclock() events period. */
+static sbintime_t profperiod; /* profclock() events period. */
+static sbintime_t nexttick; /* Next global timer tick time. */
+static u_int busy = 1; /* Reconfiguration is in progress. */
+static int profiling = 0; /* Profiling events enabled. */
+
+static char timername[32]; /* Wanted timer. */
+TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername));
+
+static int singlemul = 0; /* Multiplier for periodic mode. */
+TUNABLE_INT("kern.eventtimer.singlemul", &singlemul);
+SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RW, &singlemul,
+ 0, "Multiplier for periodic mode");
+
+static u_int idletick = 0; /* Run periodic events when idle. */
+TUNABLE_INT("kern.eventtimer.idletick", &idletick);
+SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick,
+ 0, "Run periodic events when idle");
+
+static int periodic = 0; /* Periodic or one-shot mode. */
+static int want_periodic = 0; /* What mode to prefer. */
+TUNABLE_INT("kern.eventtimer.periodic", &want_periodic);
+
+struct pcpu_state {
+ struct mtx et_hw_mtx; /* Per-CPU timer mutex. */
+ u_int action; /* Reconfiguration requests. */
+ u_int handle; /* Immediate handle requests. */
+ sbintime_t now; /* Last tick time. */
+ sbintime_t nextevent; /* Next scheduled event on this CPU. */
+ sbintime_t nexttick; /* Next timer tick time. */
+ sbintime_t nexthard; /* Next hardclock() event. */
+ sbintime_t nextstat; /* Next statclock() event. */
+ sbintime_t nextprof; /* Next profclock() event. */
+ sbintime_t nextcall; /* Next callout event. */
+ sbintime_t nextcallopt; /* Next optional callout event. */
+#ifdef KDTRACE_HOOKS
+ sbintime_t nextcyc; /* Next OpenSolaris cyclics event. */
+#endif
+ int ipi; /* This CPU needs IPI. */
+ int idle; /* This CPU is in idle mode. */
+};
+
+static DPCPU_DEFINE(struct pcpu_state, timerstate);
+DPCPU_DEFINE(sbintime_t, hardclocktime);
+
+/*
+ * Timer broadcast IPI handler.
+ */
+int
+hardclockintr(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ int done;
+
+ if (doconfigtimer() || busy)
+ return (FILTER_HANDLED);
+ state = DPCPU_PTR(timerstate);
+ now = state->now;
+ CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ done = handleevents(now, 0);
+ return (done ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+/*
+ * Handle all events for the specified time on this CPU.
+ */
+static int
+handleevents(sbintime_t now, int fake)
+{
+ sbintime_t t, *hct;
+ struct trapframe *frame;
+ struct pcpu_state *state;
+ int usermode;
+ int done, runs;
+
+ CTR3(KTR_SPARE2, "handle at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ done = 0;
+ if (fake) {
+ frame = NULL;
+ usermode = 0;
+ } else {
+ frame = curthread->td_intr_frame;
+ usermode = TRAPF_USERMODE(frame);
+ }
+
+ state = DPCPU_PTR(timerstate);
+
+ runs = 0;
+ while (now >= state->nexthard) {
+ state->nexthard += tick_sbt;
+ runs++;
+ }
+ if (runs) {
+ hct = DPCPU_PTR(hardclocktime);
+ *hct = state->nexthard - tick_sbt;
+ if (fake < 2) {
+ hardclock_cnt(runs, usermode);
+ done = 1;
+ }
+ }
+ runs = 0;
+ while (now >= state->nextstat) {
+ state->nextstat += statperiod;
+ runs++;
+ }
+ if (runs && fake < 2) {
+ statclock_cnt(runs, usermode);
+ done = 1;
+ }
+ if (profiling) {
+ runs = 0;
+ while (now >= state->nextprof) {
+ state->nextprof += profperiod;
+ runs++;
+ }
+ if (runs && !fake) {
+ profclock_cnt(runs, usermode, TRAPF_PC(frame));
+ done = 1;
+ }
+ } else
+ state->nextprof = state->nextstat;
+ if (now >= state->nextcallopt) {
+ state->nextcall = state->nextcallopt = INT64_MAX;
+ callout_process(now);
+ }
+
+#ifdef KDTRACE_HOOKS
+ if (fake == 0 && now >= state->nextcyc && cyclic_clock_func != NULL) {
+ state->nextcyc = INT64_MAX;
+ (*cyclic_clock_func)(frame);
+ }
+#endif
+
+ t = getnextcpuevent(0);
+ ET_HW_LOCK(state);
+ if (!busy) {
+ state->idle = 0;
+ state->nextevent = t;
+ loadtimer(now, 0);
+ }
+ ET_HW_UNLOCK(state);
+ return (done);
+}
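+
+/*
+ * Note on batching (illustrative): if a CPU slept through, say, three
+ * hardclock periods, the loop above advances nexthard three times and a
+ * single hardclock_cnt(3, usermode) call accounts for all of them; the same
+ * batching is applied to statclock_cnt() and profclock_cnt().
+ */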
+
+/*
+ * Schedule binuptime of the next event on current CPU.
+ */
+static sbintime_t
+getnextcpuevent(int idle)
+{
+ sbintime_t event;
+ struct pcpu_state *state;
+ u_int hardfreq;
+
+ state = DPCPU_PTR(timerstate);
+ /* Handle hardclock() events, skipping some if CPU is idle. */
+ event = state->nexthard;
+ if (idle) {
+ hardfreq = (u_int)hz / 2;
+ if (tc_min_ticktock_freq > 2
+#ifdef SMP
+ && curcpu == CPU_FIRST()
+#endif
+ )
+ hardfreq = hz / tc_min_ticktock_freq;
+ if (hardfreq > 1)
+ event += tick_sbt * (hardfreq - 1);
+ }
+ /* Handle callout events. */
+ if (event > state->nextcall)
+ event = state->nextcall;
+ if (!idle) { /* If the CPU is active, handle other types of events. */
+ if (event > state->nextstat)
+ event = state->nextstat;
+ if (profiling && event > state->nextprof)
+ event = state->nextprof;
+ }
+#ifdef KDTRACE_HOOKS
+ if (event > state->nextcyc)
+ event = state->nextcyc;
+#endif
+ return (event);
+}
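+
+/*
+ * Illustrative example (values assumed, and assuming the
+ * tc_min_ticktock_freq path does not apply): with hz = 1000 an idle CPU
+ * computes hardfreq = 500, so the next hardclock() event is pushed out by
+ * 499 * tick_sbt, roughly half a second, instead of one tick; statclock()
+ * and profclock() events are not considered at all while idle.
+ */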
+
+/*
+ * Schedule binuptime of the next event on all CPUs.
+ */
+static sbintime_t
+getnextevent(void)
+{
+ struct pcpu_state *state;
+ sbintime_t event;
+#ifdef SMP
+ int cpu;
+#endif
+ int c;
+
+ state = DPCPU_PTR(timerstate);
+ event = state->nextevent;
+ c = -1;
+#ifdef SMP
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) {
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (event > state->nextevent) {
+ event = state->nextevent;
+ c = cpu;
+ }
+ }
+ }
+#endif
+ CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d",
+ curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c);
+ return (event);
+}
+
+/* Hardware timer callback function. */
+static void
+timercb(struct eventtimer *et, void *arg)
+{
+ sbintime_t now;
+ sbintime_t *next;
+ struct pcpu_state *state;
+#ifdef SMP
+ int cpu, bcast;
+#endif
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ /* Update present and next tick times. */
+ state = DPCPU_PTR(timerstate);
+ if (et->et_flags & ET_FLAGS_PERCPU) {
+ next = &state->nexttick;
+ } else
+ next = &nexttick;
+ now = sbinuptime();
+ if (periodic)
+ *next = now + timerperiod;
+ else
+ *next = -1; /* Next tick is not scheduled yet. */
+ state->now = now;
+ CTR3(KTR_SPARE2, "intr at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+
+#ifdef SMP
+ /* Prepare broadcasting to other CPUs for non-per-CPU timers. */
+ bcast = 0;
+ if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) {
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ ET_HW_LOCK(state);
+ state->now = now;
+ if (now >= state->nextevent) {
+ state->nextevent += SBT_1S;
+ if (curcpu != cpu) {
+ state->ipi = 1;
+ bcast = 1;
+ }
+ }
+ ET_HW_UNLOCK(state);
+ }
+ }
+#endif
+
+ /* Handle events for this time on this CPU. */
+ handleevents(now, 0);
+
+#ifdef SMP
+ /* Broadcast interrupt to other CPUs for non-per-CPU timers. */
+ if (bcast) {
+ CPU_FOREACH(cpu) {
+ if (curcpu == cpu)
+ continue;
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (state->ipi) {
+ state->ipi = 0;
+ ipi_cpu(cpu, IPI_HARDCLOCK);
+ }
+ }
+ }
+#endif
+}
+
+/*
+ * Load new value into hardware timer.
+ */
+static void
+loadtimer(sbintime_t now, int start)
+{
+ struct pcpu_state *state;
+ sbintime_t new;
+ sbintime_t *next;
+ uint64_t tmp;
+ int eq;
+
+ if (timer->et_flags & ET_FLAGS_PERCPU) {
+ state = DPCPU_PTR(timerstate);
+ next = &state->nexttick;
+ } else
+ next = &nexttick;
+ if (periodic) {
+ if (start) {
+ /*
+ * Try to start all periodic timers aligned
+ * to period to make events synchronous.
+ */
+ tmp = now % timerperiod;
+ new = timerperiod - tmp;
+ if (new < tmp) /* Left less than passed. */
+ new += timerperiod;
+ CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
+ (int)(new >> 32), (u_int)(new & 0xffffffff));
+ *next = new + now;
+ et_start(timer, new, timerperiod);
+ }
+ } else {
+ new = getnextevent();
+ eq = (new == *next);
+ CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d",
+ curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq);
+ if (!eq) {
+ *next = new;
+ et_start(timer, new - now, 0);
+ }
+ }
+}
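+
+/*
+ * Alignment example (illustrative only): in periodic mode with
+ * timerperiod = SBT_1S / 1000 and now = 2.0003 s, tmp is 0.0003 s and the
+ * first interval programmed is 0.0007 s, so every periodic timer fires on
+ * the same 1 ms grid and events stay synchronous across CPUs.
+ */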
+
+/*
+ * Prepare event timer parameters after configuration changes.
+ */
+static void
+setuptimer(void)
+{
+ int freq;
+
+ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
+ periodic = 0;
+ else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
+ periodic = 1;
+ singlemul = MIN(MAX(singlemul, 1), 20);
+ freq = hz * singlemul;
+ while (freq < (profiling ? profhz : stathz))
+ freq += hz;
+ freq = round_freq(timer, freq);
+ timerperiod = SBT_1S / freq;
+}
+
+/*
+ * Reconfigure the specified per-CPU timer on another CPU. Called from the IPI handler.
+ */
+static int
+doconfigtimer(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+
+ state = DPCPU_PTR(timerstate);
+ switch (atomic_load_acq_int(&state->action)) {
+ case 1:
+ now = sbinuptime();
+ ET_HW_LOCK(state);
+ loadtimer(now, 1);
+ ET_HW_UNLOCK(state);
+ state->handle = 0;
+ atomic_store_rel_int(&state->action, 0);
+ return (1);
+ case 2:
+ ET_HW_LOCK(state);
+ et_stop(timer);
+ ET_HW_UNLOCK(state);
+ state->handle = 0;
+ atomic_store_rel_int(&state->action, 0);
+ return (1);
+ }
+ if (atomic_readandclear_int(&state->handle) && !busy) {
+ now = sbinuptime();
+ handleevents(now, 0);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Reconfigure the specified timer.
+ * For per-CPU timers, use an IPI to make the other CPUs reconfigure.
+ */
+static void
+configtimer(int start)
+{
+ sbintime_t now, next;
+ struct pcpu_state *state;
+ int cpu;
+
+ if (start) {
+ setuptimer();
+ now = sbinuptime();
+ } else
+ now = 0;
+ critical_enter();
+ ET_HW_LOCK(DPCPU_PTR(timerstate));
+ if (start) {
+ /* Initialize time machine parameters. */
+ next = now + timerperiod;
+ if (periodic)
+ nexttick = next;
+ else
+ nexttick = -1;
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ state->now = now;
+ if (!smp_started && cpu != CPU_FIRST())
+ state->nextevent = INT64_MAX;
+ else
+ state->nextevent = next;
+ if (periodic)
+ state->nexttick = next;
+ else
+ state->nexttick = -1;
+ state->nexthard = next;
+ state->nextstat = next;
+ state->nextprof = next;
+ state->nextcall = next;
+ state->nextcallopt = next;
+ hardclock_sync(cpu);
+ }
+ busy = 0;
+ /* Start global timer or per-CPU timer of this CPU. */
+ loadtimer(now, 1);
+ } else {
+ busy = 1;
+ /* Stop global timer or per-CPU timer of this CPU. */
+ et_stop(timer);
+ }
+ ET_HW_UNLOCK(DPCPU_PTR(timerstate));
+#ifdef SMP
+ /* If the timer is global or there are no other CPUs yet, we are done. */
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) {
+ critical_exit();
+ return;
+ }
+ /* Set reconfigure flags for other CPUs. */
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ atomic_store_rel_int(&state->action,
+ (cpu == curcpu) ? 0 : ( start ? 1 : 2));
+ }
+ /* Broadcast reconfigure IPI. */
+ ipi_all_but_self(IPI_HARDCLOCK);
+ /* Wait for the reconfiguration to complete. */
+restart:
+ cpu_spinwait();
+ CPU_FOREACH(cpu) {
+ if (cpu == curcpu)
+ continue;
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (atomic_load_acq_int(&state->action))
+ goto restart;
+ }
+#endif
+ critical_exit();
+}
+
+/*
+ * Calculate nearest frequency supported by hardware timer.
+ */
+static int
+round_freq(struct eventtimer *et, int freq)
+{
+ uint64_t div;
+
+ if (et->et_frequency != 0) {
+ div = lmax((et->et_frequency + freq / 2) / freq, 1);
+ if (et->et_flags & ET_FLAGS_POW2DIV)
+ div = 1 << (flsl(div + div / 2) - 1);
+ freq = (et->et_frequency + div / 2) / div;
+ }
+ if (et->et_min_period > SBT_1S)
+ panic("Event timer \"%s\" doesn't support sub-second periods!",
+ et->et_name);
+ else if (et->et_min_period != 0)
+ freq = min(freq, SBT2FREQ(et->et_min_period));
+ if (et->et_max_period < SBT_1S && et->et_max_period != 0)
+ freq = max(freq, SBT2FREQ(et->et_max_period));
+ return (freq);
+}
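+
+/*
+ * Example (hypothetical hardware): for an event timer with
+ * et_frequency = 32768 Hz and a requested freq of 1000 Hz, div becomes
+ * (32768 + 500) / 1000 = 33 and the returned frequency is
+ * (32768 + 16) / 33 = 993 Hz, the closest rate the divider can produce.
+ * With ET_FLAGS_POW2DIV set, div would instead be rounded to a power of two.
+ */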
+
+/*
+ * Configure and start event timers (BSP part).
+ */
+void
+cpu_initclocks_bsp(void)
+{
+ struct pcpu_state *state;
+ int base, div, cpu;
+
+ mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
+#ifdef KDTRACE_HOOKS
+ state->nextcyc = INT64_MAX;
+#endif
+ state->nextcall = INT64_MAX;
+ state->nextcallopt = INT64_MAX;
+ }
+ periodic = want_periodic;
+ /* Grab the requested timer or the best one present. */
+ if (timername[0])
+ timer = et_find(timername, 0, 0);
+ if (timer == NULL && periodic) {
+ timer = et_find(NULL,
+ ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
+ }
+ if (timer == NULL) {
+ timer = et_find(NULL,
+ ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT);
+ }
+ if (timer == NULL && !periodic) {
+ timer = et_find(NULL,
+ ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
+ }
+ if (timer == NULL)
+ panic("No usable event timer found!");
+ et_init(timer, timercb, NULL, NULL);
+
+ /* Adapt to timer capabilities. */
+ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
+ periodic = 0;
+ else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
+ periodic = 1;
+ if (timer->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep++;
+
+ /*
+ * We honor the requested 'hz' value.
+ * We want to run stathz in the neighborhood of 128hz.
+ * We would like profhz to run as often as possible.
+ */
+ if (singlemul <= 0 || singlemul > 20) {
+ if (hz >= 1500 || (hz % 128) == 0)
+ singlemul = 1;
+ else if (hz >= 750)
+ singlemul = 2;
+ else
+ singlemul = 4;
+ }
+ if (periodic) {
+ base = round_freq(timer, hz * singlemul);
+ singlemul = max((base + hz / 2) / hz, 1);
+ hz = (base + singlemul / 2) / singlemul;
+ if (base <= 128)
+ stathz = base;
+ else {
+ div = base / 128;
+ if (div >= singlemul && (div % singlemul) == 0)
+ div++;
+ stathz = base / div;
+ }
+ profhz = stathz;
+ while ((profhz + stathz) <= 128 * 64)
+ profhz += stathz;
+ profhz = round_freq(timer, profhz);
+ } else {
+ hz = round_freq(timer, hz);
+ stathz = round_freq(timer, 127);
+ profhz = round_freq(timer, stathz * 64);
+ }
+ tick = 1000000 / hz;
+ tick_sbt = SBT_1S / hz;
+ tick_bt = sbttobt(tick_sbt);
+ statperiod = SBT_1S / stathz;
+ profperiod = SBT_1S / profhz;
+ ET_LOCK();
+ configtimer(1);
+ ET_UNLOCK();
+}
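+
+/*
+ * Example of the one-shot rate selection above (illustrative, assuming a
+ * timer with no frequency restrictions and hz = 1000): hz stays 1000,
+ * stathz becomes 127 and profhz becomes 127 * 64 = 8128, giving
+ * tick = 1000 us, tick_sbt = SBT_1S / 1000, statperiod = SBT_1S / 127 and
+ * profperiod = SBT_1S / 8128.
+ */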
+
+/*
+ * Start per-CPU event timers on APs.
+ */
+void
+cpu_initclocks_ap(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ struct thread *td;
+
+ state = DPCPU_PTR(timerstate);
+ now = sbinuptime();
+ ET_HW_LOCK(state);
+ state->now = now;
+ hardclock_sync(curcpu);
+ spinlock_enter();
+ ET_HW_UNLOCK(state);
+ td = curthread;
+ td->td_intr_nesting_level++;
+ handleevents(state->now, 2);
+ td->td_intr_nesting_level--;
+ spinlock_exit();
+}
+
+/*
+ * Switch to profiling clock rates.
+ */
+void
+cpu_startprofclock(void)
+{
+
+ ET_LOCK();
+ if (profiling == 0) {
+ if (periodic) {
+ configtimer(0);
+ profiling = 1;
+ configtimer(1);
+ } else
+ profiling = 1;
+ } else
+ profiling++;
+ ET_UNLOCK();
+}
+
+/*
+ * Switch to regular clock rates.
+ */
+void
+cpu_stopprofclock(void)
+{
+
+ ET_LOCK();
+ if (profiling == 1) {
+ if (periodic) {
+ configtimer(0);
+ profiling = 0;
+ configtimer(1);
+ } else
+ profiling = 0;
+ } else
+ profiling--;
+ ET_UNLOCK();
+}
+
+/*
+ * Switch to idle mode (all ticks handled).
+ */
+sbintime_t
+cpu_idleclock(void)
+{
+ sbintime_t now, t;
+ struct pcpu_state *state;
+
+ if (idletick || busy ||
+ (periodic && (timer->et_flags & ET_FLAGS_PERCPU))
+#ifdef DEVICE_POLLING
+ || curcpu == CPU_FIRST()
+#endif
+ )
+ return (-1);
+ state = DPCPU_PTR(timerstate);
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+ CTR3(KTR_SPARE2, "idle at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ t = getnextcpuevent(1);
+ ET_HW_LOCK(state);
+ state->idle = 1;
+ state->nextevent = t;
+ if (!periodic)
+ loadtimer(now, 0);
+ ET_HW_UNLOCK(state);
+ return (MAX(t - now, 0));
+}
+
+/*
+ * Switch to active mode (skip empty ticks).
+ */
+void
+cpu_activeclock(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ struct thread *td;
+
+ state = DPCPU_PTR(timerstate);
+ if (state->idle == 0 || busy)
+ return;
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+ CTR3(KTR_SPARE2, "active at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ spinlock_enter();
+ td = curthread;
+ td->td_intr_nesting_level++;
+ handleevents(now, 1);
+ td->td_intr_nesting_level--;
+ spinlock_exit();
+}
+
+#ifdef KDTRACE_HOOKS
+void
+clocksource_cyc_set(const struct bintime *bt)
+{
+ sbintime_t now, t;
+ struct pcpu_state *state;
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ t = bttosbt(*bt);
+ state = DPCPU_PTR(timerstate);
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+
+ CTR5(KTR_SPARE2, "set_cyc at %d: now %d.%08x t %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
+ (int)(t >> 32), (u_int)(t & 0xffffffff));
+
+ ET_HW_LOCK(state);
+ if (t == state->nextcyc)
+ goto done;
+ state->nextcyc = t;
+ if (t >= state->nextevent)
+ goto done;
+ state->nextevent = t;
+ if (!periodic)
+ loadtimer(now, 0);
+done:
+ ET_HW_UNLOCK(state);
+}
+#endif
+
+void
+cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt)
+{
+ struct pcpu_state *state;
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x",
+ curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff),
+ (int)(bt >> 32), (u_int)(bt & 0xffffffff));
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ ET_HW_LOCK(state);
+
+ /*
+ * If there is a callout time already set earlier -- do nothing.
+ * This check may appear redundant because we already check in
+ * callout_process(), but this double check guarantees we're safe
+ * with respect to race conditions between interrupt execution
+ * and scheduling.
+ */
+ state->nextcallopt = bt_opt;
+ if (bt >= state->nextcall)
+ goto done;
+ state->nextcall = bt;
+ /* If there is some other event set earlier -- do nothing. */
+ if (bt >= state->nextevent)
+ goto done;
+ state->nextevent = bt;
+ /* If timer is periodic -- there is nothing to reprogram. */
+ if (periodic)
+ goto done;
+ /* If timer is global or of the current CPU -- reprogram it. */
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) {
+ loadtimer(sbinuptime(), 0);
+done:
+ ET_HW_UNLOCK(state);
+ return;
+ }
+ /* Otherwise, make the other CPU reprogram it. */
+ state->handle = 1;
+ ET_HW_UNLOCK(state);
+#ifdef SMP
+ ipi_cpu(cpu, IPI_HARDCLOCK);
+#endif
+}
+
+/*
+ * Report or change the active event timer hardware.
+ */
+static int
+sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS)
+{
+ char buf[32];
+ struct eventtimer *et;
+ int error;
+
+ ET_LOCK();
+ et = timer;
+ snprintf(buf, sizeof(buf), "%s", et->et_name);
+ ET_UNLOCK();
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ ET_LOCK();
+ et = timer;
+ if (error != 0 || req->newptr == NULL ||
+ strcasecmp(buf, et->et_name) == 0) {
+ ET_UNLOCK();
+ return (error);
+ }
+ et = et_find(buf, 0, 0);
+ if (et == NULL) {
+ ET_UNLOCK();
+ return (ENOENT);
+ }
+ configtimer(0);
+ et_free(timer);
+ if (et->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep++;
+ if (timer->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep--;
+ periodic = want_periodic;
+ timer = et;
+ et_init(timer, timercb, NULL, NULL);
+ configtimer(1);
+ ET_UNLOCK();
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer");
+
+/*
+ * Report or change the active event timer periodicity.
+ */
+static int
+sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = periodic;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ ET_LOCK();
+ configtimer(0);
+ periodic = want_periodic = val;
+ configtimer(1);
+ ET_UNLOCK();
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode");
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
new file mode 100644
index 0000000..483ea2e
--- /dev/null
+++ b/sys/kern/kern_condvar.c
@@ -0,0 +1,456 @@
+/*-
+ * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/sched.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/resourcevar.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+/*
+ * Common sanity checks for cv_wait* functions.
+ */
+#define CV_ASSERT(cvp, lock, td) do { \
+ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \
+ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \
+ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
+ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \
+} while (0)
+
+/*
+ * Initialize a condition variable. Must be called before use.
+ */
+void
+cv_init(struct cv *cvp, const char *desc)
+{
+
+ cvp->cv_description = desc;
+ cvp->cv_waiters = 0;
+}
+
+/*
+ * Destroy a condition variable. The condition variable must be re-initialized
+ * in order to be re-used.
+ */
+void
+cv_destroy(struct cv *cvp)
+{
+#ifdef INVARIANTS
+ struct sleepqueue *sq;
+
+ sleepq_lock(cvp);
+ sq = sleepq_lookup(cvp);
+ sleepq_release(cvp);
+ KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__));
+#endif
+}
+
+/*
+ * Wait on a condition variable. The current thread is placed on the condition
+ * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same
+ * condition variable will resume the thread. The mutex is released before
+ * sleeping and will be held on return. It is recommended that the mutex be
+ * held when cv_signal or cv_broadcast are called.
+ */
+void
+_cv_wait(struct cv *cvp, struct lock_object *lock)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * During autoconfiguration, just give interrupts
+ * a chance, then just return. Don't run any other
+ * thread or panic below, in case this is the idle
+ * process and already asleep.
+ */
+ return;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ sleepq_wait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+}
+
+/*
+ * Wait on a condition variable. This function differs from cv_wait by
+ * not acquiring the mutex after the condition variable was signaled.
+ */
+void
+_cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
+{
+ struct lock_class *class;
+ struct thread *td;
+
+ td = curthread;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ KASSERT(lock != &Giant.lock_object,
+ ("cv_wait_unlock cannot be used with Giant"));
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * During autoconfiguration, just give interrupts
+ * a chance, then just return. Don't run any other
+ * thread or panic below, in case this is the idle
+ * process and already asleep.
+ */
+ class->lc_unlock(lock);
+ return;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ sleepq_wait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+}
+
+/*
+ * Wait on a condition variable, allowing interruption by signals. Return 0 if
+ * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if
+ * a signal was caught. If ERESTART is returned the system call should be
+ * restarted if possible.
+ */
+int
+_cv_wait_sig(struct cv *cvp, struct lock_object *lock)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * procs or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return (0);
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_wait_sig(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Wait on a condition variable for (at most) the value specified in the sbt
+ * argument. Returns 0 if the thread was resumed by cv_signal or cv_broadcast,
+ * EWOULDBLOCK if the timeout expires.
+ */
+int
+_cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt,
+ sbintime_t pr, int flags)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_timedwait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Wait on a condition variable for (at most) the value specified in the sbt
+ * argument, allowing interruption by signals.
+ * Returns 0 if the thread was resumed by cv_signal or cv_broadcast,
+ * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal
+ * was caught.
+ */
+int
+_cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock,
+ sbintime_t sbt, sbintime_t pr, int flags)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
+ sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_timedwait_sig(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Signal a condition variable, waking up one waiting thread. This will also
+ * wake up the swapper if the process is not in memory, so that it can bring
+ * the sleeping process in. Note that this may also result in additional threads
+ * being made runnable. Should be called with the same mutex as was passed to
+ * cv_wait held.
+ */
+void
+cv_signal(struct cv *cvp)
+{
+ int wakeup_swapper;
+
+ wakeup_swapper = 0;
+ sleepq_lock(cvp);
+ if (cvp->cv_waiters > 0) {
+ cvp->cv_waiters--;
+ wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0);
+ }
+ sleepq_release(cvp);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Broadcast a signal to a condition variable. Wakes up all waiting threads.
+ * Should be called with the same mutex as was passed to cv_wait held.
+ */
+void
+cv_broadcastpri(struct cv *cvp, int pri)
+{
+ int wakeup_swapper;
+
+ /*
+ * XXX sleepq_broadcast pri argument changed from -1 meaning
+ * no pri to 0 meaning no pri.
+ */
+ wakeup_swapper = 0;
+ if (pri == -1)
+ pri = 0;
+ sleepq_lock(cvp);
+ if (cvp->cv_waiters > 0) {
+ cvp->cv_waiters = 0;
+ wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0);
+ }
+ sleepq_release(cvp);
+ if (wakeup_swapper)
+ kick_proc0();
+}
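+
+/*
+ * Typical usage of the interfaces above (a minimal sketch; cv_wait() and
+ * cv_signal() are the wrapper macros from sys/condvar.h, while the mutex,
+ * condvar and flag names below are made up for illustration):
+ *
+ *	struct mtx	m;
+ *	struct cv	c;
+ *	int		ready;
+ *
+ *	mtx_init(&m, "example", NULL, MTX_DEF);
+ *	cv_init(&c, "example");
+ *
+ *	// consumer: sleep until the condition becomes true
+ *	mtx_lock(&m);
+ *	while (ready == 0)
+ *		cv_wait(&c, &m);	// drops and reacquires m
+ *	mtx_unlock(&m);
+ *
+ *	// producer: change the condition, then signal with the mutex held
+ *	mtx_lock(&m);
+ *	ready = 1;
+ *	cv_signal(&c);
+ *	mtx_unlock(&m);
+ */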
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
new file mode 100644
index 0000000..c04d1da
--- /dev/null
+++ b/sys/kern/kern_conf.c
@@ -0,0 +1,1459 @@
+/*-
+ * Copyright (c) 1999-2002 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/bio.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+#include <sys/queue.h>
+#include <sys/poll.h>
+#include <sys/sx.h>
+#include <sys/ctype.h>
+#include <sys/ucred.h>
+#include <sys/taskqueue.h>
+#include <machine/stdarg.h>
+
+#include <fs/devfs/devfs_int.h>
+#include <vm/vm.h>
+
+static MALLOC_DEFINE(M_DEVT, "cdev", "cdev storage");
+
+struct mtx devmtx;
+static void destroy_devl(struct cdev *dev);
+static int destroy_dev_sched_cbl(struct cdev *dev,
+ void (*cb)(void *), void *arg);
+static void destroy_dev_tq(void *ctx, int pending);
+static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw,
+ int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+ va_list ap);
+
+static struct cdev_priv_list cdevp_free_list =
+ TAILQ_HEAD_INITIALIZER(cdevp_free_list);
+static SLIST_HEAD(free_cdevsw, cdevsw) cdevsw_gt_post_list =
+ SLIST_HEAD_INITIALIZER(cdevsw_gt_post_list);
+
+void
+dev_lock(void)
+{
+
+ mtx_lock(&devmtx);
+}
+
+/*
+ * Free all the memory collected while the cdev mutex was
+ * locked. Since devmtx is after the system map mutex, free() cannot
+ * be called immediately and is postponed until the cdev mutex can be
+ * dropped.
+ */
+static void
+dev_unlock_and_free(void)
+{
+ struct cdev_priv_list cdp_free;
+ struct free_cdevsw csw_free;
+ struct cdev_priv *cdp;
+ struct cdevsw *csw;
+
+ mtx_assert(&devmtx, MA_OWNED);
+
+ /*
+ * Make the local copy of the list heads while the dev_mtx is
+ * held. Free it later.
+ */
+ TAILQ_INIT(&cdp_free);
+ TAILQ_CONCAT(&cdp_free, &cdevp_free_list, cdp_list);
+ csw_free = cdevsw_gt_post_list;
+ SLIST_INIT(&cdevsw_gt_post_list);
+
+ mtx_unlock(&devmtx);
+
+ while ((cdp = TAILQ_FIRST(&cdp_free)) != NULL) {
+ TAILQ_REMOVE(&cdp_free, cdp, cdp_list);
+ devfs_free(&cdp->cdp_c);
+ }
+ while ((csw = SLIST_FIRST(&csw_free)) != NULL) {
+ SLIST_REMOVE_HEAD(&csw_free, d_postfree_list);
+ free(csw, M_DEVT);
+ }
+}
+
+static void
+dev_free_devlocked(struct cdev *cdev)
+{
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cdp = cdev2priv(cdev);
+ TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list);
+}
+
+static void
+cdevsw_free_devlocked(struct cdevsw *csw)
+{
+
+ mtx_assert(&devmtx, MA_OWNED);
+ SLIST_INSERT_HEAD(&cdevsw_gt_post_list, csw, d_postfree_list);
+}
+
+void
+dev_unlock(void)
+{
+
+ mtx_unlock(&devmtx);
+}
+
+void
+dev_ref(struct cdev *dev)
+{
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ mtx_lock(&devmtx);
+ dev->si_refcount++;
+ mtx_unlock(&devmtx);
+}
+
+void
+dev_refl(struct cdev *dev)
+{
+
+ mtx_assert(&devmtx, MA_OWNED);
+ dev->si_refcount++;
+}
+
+void
+dev_rel(struct cdev *dev)
+{
+ int flag = 0;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ dev_lock();
+ dev->si_refcount--;
+ KASSERT(dev->si_refcount >= 0,
+ ("dev_rel(%s) gave negative count", devtoname(dev)));
+#if 0
+ if (dev->si_usecount == 0 &&
+ (dev->si_flags & SI_CHEAPCLONE) && (dev->si_flags & SI_NAMED))
+ ;
+ else
+#endif
+ if (dev->si_devsw == NULL && dev->si_refcount == 0) {
+ LIST_REMOVE(dev, si_list);
+ flag = 1;
+ }
+ dev_unlock();
+ if (flag)
+ devfs_free(dev);
+}
+
+struct cdevsw *
+dev_refthread(struct cdev *dev, int *ref)
+{
+ struct cdevsw *csw;
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if ((dev->si_flags & SI_ETERNAL) != 0) {
+ *ref = 0;
+ return (dev->si_devsw);
+ }
+ dev_lock();
+ csw = dev->si_devsw;
+ if (csw != NULL) {
+ cdp = cdev2priv(dev);
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
+ dev->si_threadcount++;
+ else
+ csw = NULL;
+ }
+ dev_unlock();
+ *ref = 1;
+ return (csw);
+}
+
+struct cdevsw *
+devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref)
+{
+ struct cdevsw *csw;
+ struct cdev_priv *cdp;
+ struct cdev *dev;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if ((vp->v_vflag & VV_ETERNALDEV) != 0) {
+ dev = vp->v_rdev;
+ if (dev == NULL)
+ return (NULL);
+ KASSERT((dev->si_flags & SI_ETERNAL) != 0,
+ ("Not eternal cdev"));
+ *ref = 0;
+ csw = dev->si_devsw;
+ KASSERT(csw != NULL, ("Eternal cdev is destroyed"));
+ *devp = dev;
+ return (csw);
+ }
+
+ csw = NULL;
+ dev_lock();
+ dev = vp->v_rdev;
+ if (dev == NULL) {
+ dev_unlock();
+ return (NULL);
+ }
+ cdp = cdev2priv(dev);
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
+ csw = dev->si_devsw;
+ if (csw != NULL)
+ dev->si_threadcount++;
+ }
+ dev_unlock();
+ if (csw != NULL) {
+ *devp = dev;
+ *ref = 1;
+ }
+ return (csw);
+}
+
+void
+dev_relthread(struct cdev *dev, int ref)
+{
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if (!ref)
+ return;
+ dev_lock();
+ KASSERT(dev->si_threadcount > 0,
+ ("%s threadcount is wrong", dev->si_name));
+ dev->si_threadcount--;
+ dev_unlock();
+}
+
+int
+nullop(void)
+{
+
+ return (0);
+}
+
+int
+eopnotsupp(void)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+enxio(void)
+{
+ return (ENXIO);
+}
+
+static int
+enodev(void)
+{
+ return (ENODEV);
+}
+
+/* Define a dead_cdevsw for use when devices leave unexpectedly. */
+
+#define dead_open (d_open_t *)enxio
+#define dead_close (d_close_t *)enxio
+#define dead_read (d_read_t *)enxio
+#define dead_write (d_write_t *)enxio
+#define dead_ioctl (d_ioctl_t *)enxio
+#define dead_poll (d_poll_t *)enodev
+#define dead_mmap (d_mmap_t *)enodev
+
+static void
+dead_strategy(struct bio *bp)
+{
+
+ biofinish(bp, NULL, ENXIO);
+}
+
+#define dead_dump (dumper_t *)enxio
+#define dead_kqfilter (d_kqfilter_t *)enxio
+#define dead_mmap_single (d_mmap_single_t *)enodev
+
+static struct cdevsw dead_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = dead_open,
+ .d_close = dead_close,
+ .d_read = dead_read,
+ .d_write = dead_write,
+ .d_ioctl = dead_ioctl,
+ .d_poll = dead_poll,
+ .d_mmap = dead_mmap,
+ .d_strategy = dead_strategy,
+ .d_name = "dead",
+ .d_dump = dead_dump,
+ .d_kqfilter = dead_kqfilter,
+ .d_mmap_single = dead_mmap_single
+};
+
+/* Default methods if driver does not specify method */
+
+#define null_open (d_open_t *)nullop
+#define null_close (d_close_t *)nullop
+#define no_read (d_read_t *)enodev
+#define no_write (d_write_t *)enodev
+#define no_ioctl (d_ioctl_t *)enodev
+#define no_mmap (d_mmap_t *)enodev
+#define no_kqfilter (d_kqfilter_t *)enodev
+#define no_mmap_single (d_mmap_single_t *)enodev
+
+static void
+no_strategy(struct bio *bp)
+{
+
+ biofinish(bp, NULL, ENODEV);
+}
+
+static int
+no_poll(struct cdev *dev __unused, int events, struct thread *td __unused)
+{
+
+ return (poll_no_poll(events));
+}
+
+#define no_dump (dumper_t *)enodev
+
+static int
+giant_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_open(dev, oflags, devtype, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_fdopen(dev, oflags, td, fp);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_close(dev, fflag, devtype, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static void
+giant_strategy(struct bio *bp)
+{
+ struct cdevsw *dsw;
+ struct cdev *dev;
+ int ref;
+
+ dev = bp->bio_dev;
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL) {
+ biofinish(bp, NULL, ENXIO);
+ return;
+ }
+ mtx_lock(&Giant);
+ dsw->d_gianttrick->d_strategy(bp);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+}
+
+static int
+giant_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_ioctl(dev, cmd, data, fflag, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_read(dev, uio, ioflag);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_write(dev, uio, ioflag);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_poll(struct cdev *dev, int events, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_poll(dev, events, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_kqfilter(dev, kn);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot,
+ vm_memattr_t *memattr)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_mmap(dev, offset, paddr, nprot,
+ memattr);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size,
+ vm_object_t *object, int nprot)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_mmap_single(dev, offset, size, object,
+ nprot);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static void
+notify(struct cdev *dev, const char *ev, int flags)
+{
+ static const char prefix[] = "cdev=";
+ char *data;
+ int namelen, mflags;
+
+ if (cold)
+ return;
+ mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK;
+ namelen = strlen(dev->si_name);
+ data = malloc(namelen + sizeof(prefix), M_TEMP, mflags);
+ if (data == NULL)
+ return;
+ memcpy(data, prefix, sizeof(prefix) - 1);
+ memcpy(data + sizeof(prefix) - 1, dev->si_name, namelen + 1);
+ devctl_notify_f("DEVFS", "CDEV", ev, data, mflags);
+ free(data, M_TEMP);
+}
+
+static void
+notify_create(struct cdev *dev, int flags)
+{
+
+ notify(dev, "CREATE", flags);
+}
+
+static void
+notify_destroy(struct cdev *dev)
+{
+
+ notify(dev, "DESTROY", MAKEDEV_WAITOK);
+}
+
+static struct cdev *
+newdev(struct cdevsw *csw, int unit, struct cdev *si)
+{
+ struct cdev *si2;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ if (csw->d_flags & D_NEEDMINOR) {
+ /* We may want to return an existing device */
+ LIST_FOREACH(si2, &csw->d_devs, si_list) {
+ if (dev2unit(si2) == unit) {
+ dev_free_devlocked(si);
+ return (si2);
+ }
+ }
+ }
+ si->si_drv0 = unit;
+ si->si_devsw = csw;
+ LIST_INSERT_HEAD(&csw->d_devs, si, si_list);
+ return (si);
+}
+
+static void
+fini_cdevsw(struct cdevsw *devsw)
+{
+ struct cdevsw *gt;
+
+ if (devsw->d_gianttrick != NULL) {
+ gt = devsw->d_gianttrick;
+ memcpy(devsw, gt, sizeof *devsw);
+ cdevsw_free_devlocked(gt);
+ devsw->d_gianttrick = NULL;
+ }
+ devsw->d_flags &= ~D_INIT;
+}
+
+static int
+prep_cdevsw(struct cdevsw *devsw, int flags)
+{
+ struct cdevsw *dsw2;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ if (devsw->d_flags & D_INIT)
+ return (0);
+ if (devsw->d_flags & D_NEEDGIANT) {
+ dev_unlock();
+ dsw2 = malloc(sizeof *dsw2, M_DEVT,
+ (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK);
+ dev_lock();
+ if (dsw2 == NULL && !(devsw->d_flags & D_INIT))
+ return (ENOMEM);
+ } else
+ dsw2 = NULL;
+ if (devsw->d_flags & D_INIT) {
+ if (dsw2 != NULL)
+ cdevsw_free_devlocked(dsw2);
+ return (0);
+ }
+
+ if (devsw->d_version != D_VERSION_03) {
+ printf(
+ "WARNING: Device driver \"%s\" has wrong version %s\n",
+ devsw->d_name == NULL ? "???" : devsw->d_name,
+ "and is disabled. Recompile KLD module.");
+ devsw->d_open = dead_open;
+ devsw->d_close = dead_close;
+ devsw->d_read = dead_read;
+ devsw->d_write = dead_write;
+ devsw->d_ioctl = dead_ioctl;
+ devsw->d_poll = dead_poll;
+ devsw->d_mmap = dead_mmap;
+ devsw->d_mmap_single = dead_mmap_single;
+ devsw->d_strategy = dead_strategy;
+ devsw->d_dump = dead_dump;
+ devsw->d_kqfilter = dead_kqfilter;
+ }
+
+ if (devsw->d_flags & D_NEEDGIANT) {
+ if (devsw->d_gianttrick == NULL) {
+ memcpy(dsw2, devsw, sizeof *dsw2);
+ devsw->d_gianttrick = dsw2;
+ dsw2 = NULL;
+ }
+ }
+
+#define FIXUP(member, noop, giant) \
+ do { \
+ if (devsw->member == NULL) { \
+ devsw->member = noop; \
+ } else if (devsw->d_flags & D_NEEDGIANT) \
+ devsw->member = giant; \
+ } \
+ while (0)
+
+ FIXUP(d_open, null_open, giant_open);
+ FIXUP(d_fdopen, NULL, giant_fdopen);
+ FIXUP(d_close, null_close, giant_close);
+ FIXUP(d_read, no_read, giant_read);
+ FIXUP(d_write, no_write, giant_write);
+ FIXUP(d_ioctl, no_ioctl, giant_ioctl);
+ FIXUP(d_poll, no_poll, giant_poll);
+ FIXUP(d_mmap, no_mmap, giant_mmap);
+ FIXUP(d_strategy, no_strategy, giant_strategy);
+ FIXUP(d_kqfilter, no_kqfilter, giant_kqfilter);
+ FIXUP(d_mmap_single, no_mmap_single, giant_mmap_single);
+
+ if (devsw->d_dump == NULL)
+ devsw->d_dump = no_dump;
+
+ LIST_INIT(&devsw->d_devs);
+
+ devsw->d_flags |= D_INIT;
+
+ if (dsw2 != NULL)
+ cdevsw_free_devlocked(dsw2);
+ return (0);
+}
+
+static int
+prep_devname(struct cdev *dev, const char *fmt, va_list ap)
+{
+ int len;
+ char *from, *q, *s, *to;
+
+ mtx_assert(&devmtx, MA_OWNED);
+
+ len = vsnrprintf(dev->si_name, sizeof(dev->si_name), 32, fmt, ap);
+ if (len > sizeof(dev->si_name) - 1)
+ return (ENAMETOOLONG);
+
+ /* Strip leading slashes. */
+ for (from = dev->si_name; *from == '/'; from++)
+ ;
+
+ for (to = dev->si_name; *from != '\0'; from++, to++) {
+ /*
+ * Spaces and double quotation marks cause
+ * problems for the devctl(4) protocol.
+ * Reject names containing those characters.
+ */
+ if (isspace(*from) || *from == '"')
+ return (EINVAL);
+ /* Treat multiple sequential slashes as single. */
+ while (from[0] == '/' && from[1] == '/')
+ from++;
+ /* Trailing slash is considered invalid. */
+ if (from[0] == '/' && from[1] == '\0')
+ return (EINVAL);
+ *to = *from;
+ }
+ *to = '\0';
+
+ if (dev->si_name[0] == '\0')
+ return (EINVAL);
+
+ /* Disallow "." and ".." components. */
+ for (s = dev->si_name;;) {
+ for (q = s; *q != '/' && *q != '\0'; q++)
+ ;
+ if (q - s == 1 && s[0] == '.')
+ return (EINVAL);
+ if (q - s == 2 && s[0] == '.' && s[1] == '.')
+ return (EINVAL);
+ if (*q != '/')
+ break;
+ s = q + 1;
+ }
+
+ if (devfs_dev_exists(dev->si_name) != 0)
+ return (EEXIST);
+
+ return (0);
+}
+
+static int
+make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit,
+ struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+ va_list ap)
+{
+ struct cdev *dev, *dev_new;
+ int res;
+
+ KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0,
+ ("make_dev_credv: both WAITOK and NOWAIT specified"));
+ dev_new = devfs_alloc(flags);
+ if (dev_new == NULL)
+ return (ENOMEM);
+ dev_lock();
+ res = prep_cdevsw(devsw, flags);
+ if (res != 0) {
+ dev_unlock();
+ devfs_free(dev_new);
+ return (res);
+ }
+ dev = newdev(devsw, unit, dev_new);
+ if ((dev->si_flags & SI_NAMED) == 0) {
+ res = prep_devname(dev, fmt, ap);
+ if (res != 0) {
+ if ((flags & MAKEDEV_CHECKNAME) == 0) {
+ panic(
+ "make_dev_credv: bad si_name (error=%d, si_name=%s)",
+ res, dev->si_name);
+ }
+ if (dev == dev_new) {
+ LIST_REMOVE(dev, si_list);
+ dev_unlock();
+ devfs_free(dev);
+ } else
+ dev_unlock();
+ return (res);
+ }
+ }
+ if (flags & MAKEDEV_REF)
+ dev_refl(dev);
+ if (flags & MAKEDEV_ETERNAL)
+ dev->si_flags |= SI_ETERNAL;
+ if (dev->si_flags & SI_CHEAPCLONE &&
+ dev->si_flags & SI_NAMED) {
+ /*
+ * This is allowed as it removes races and generally
+ * simplifies cloning devices.
+ * XXX: still ??
+ */
+ dev_unlock_and_free();
+ *dres = dev;
+ return (0);
+ }
+ KASSERT(!(dev->si_flags & SI_NAMED),
+ ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)",
+ devsw->d_name, dev2unit(dev), devtoname(dev)));
+ dev->si_flags |= SI_NAMED;
+ if (cr != NULL)
+ dev->si_cred = crhold(cr);
+ dev->si_uid = uid;
+ dev->si_gid = gid;
+ dev->si_mode = mode;
+
+ devfs_create(dev);
+ clean_unrhdrl(devfs_inos);
+ dev_unlock_and_free();
+
+ notify_create(dev, flags);
+
+ *dres = dev;
+ return (0);
+}
+
+struct cdev *
+make_dev(struct cdevsw *devsw, int unit, uid_t uid, gid_t gid, int mode,
+ const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(0, &dev, devsw, unit, NULL, uid, gid, mode, fmt,
+ ap);
+ va_end(ap);
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev: failed make_dev_credv (error=%d)", res));
+ return (dev);
+}
+
+struct cdev *
+make_dev_cred(struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid,
+ gid_t gid, int mode, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(0, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap);
+ va_end(ap);
+
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev_cred: failed make_dev_credv (error=%d)", res));
+ return (dev);
+}
+
+struct cdev *
+make_dev_credf(int flags, struct cdevsw *devsw, int unit, struct ucred *cr,
+ uid_t uid, gid_t gid, int mode, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(flags, &dev, devsw, unit, cr, uid, gid, mode,
+ fmt, ap);
+ va_end(ap);
+
+ KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) ||
+ ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0,
+ ("make_dev_credf: failed make_dev_credv (error=%d)", res));
+ return (res == 0 ? dev : NULL);
+}
+
+int
+make_dev_p(int flags, struct cdev **cdev, struct cdevsw *devsw,
+ struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...)
+{
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(flags, cdev, devsw, 0, cr, uid, gid, mode,
+ fmt, ap);
+ va_end(ap);
+
+ KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) ||
+ ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0,
+ ("make_dev_p: failed make_dev_credv (error=%d)", res));
+ return (res);
+}
+
+static void
+dev_dependsl(struct cdev *pdev, struct cdev *cdev)
+{
+
+ cdev->si_parent = pdev;
+ cdev->si_flags |= SI_CHILD;
+ LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings);
+}
+
+
+void
+dev_depends(struct cdev *pdev, struct cdev *cdev)
+{
+
+ dev_lock();
+ dev_dependsl(pdev, cdev);
+ dev_unlock();
+}
+
+static int
+make_dev_alias_v(int flags, struct cdev **cdev, struct cdev *pdev,
+ const char *fmt, va_list ap)
+{
+ struct cdev *dev;
+ int error;
+
+ KASSERT(pdev != NULL, ("make_dev_alias_v: pdev is NULL"));
+ KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0,
+ ("make_dev_alias_v: both WAITOK and NOWAIT specified"));
+ KASSERT((flags & ~(MAKEDEV_WAITOK | MAKEDEV_NOWAIT |
+ MAKEDEV_CHECKNAME)) == 0,
+ ("make_dev_alias_v: invalid flags specified (flags=%02x)", flags));
+
+ dev = devfs_alloc(flags);
+ if (dev == NULL)
+ return (ENOMEM);
+ dev_lock();
+ dev->si_flags |= SI_ALIAS;
+ error = prep_devname(dev, fmt, ap);
+ if (error != 0) {
+ if ((flags & MAKEDEV_CHECKNAME) == 0) {
+ panic("make_dev_alias_v: bad si_name "
+ "(error=%d, si_name=%s)", error, dev->si_name);
+ }
+ dev_unlock();
+ devfs_free(dev);
+ return (error);
+ }
+ dev->si_flags |= SI_NAMED;
+ devfs_create(dev);
+ dev_dependsl(pdev, dev);
+ clean_unrhdrl(devfs_inos);
+ dev_unlock();
+
+ notify_create(dev, flags);
+ *cdev = dev;
+
+ return (0);
+}
+
+struct cdev *
+make_dev_alias(struct cdev *pdev, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_alias_v(MAKEDEV_WAITOK, &dev, pdev, fmt, ap);
+ va_end(ap);
+
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev_alias: failed make_dev_alias_v (error=%d)", res));
+ return (dev);
+}
+
+int
+make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_alias_v(flags, cdev, pdev, fmt, ap);
+ va_end(ap);
+ return (res);
+}
+
+int
+make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev,
+ struct cdev *old_alias, const char *physpath)
+{
+ char *devfspath;
+ int physpath_len;
+ int max_parentpath_len;
+ int parentpath_len;
+ int devfspathbuf_len;
+ int mflags;
+ int ret;
+
+ *cdev = NULL;
+ devfspath = NULL;
+ physpath_len = strlen(physpath);
+ ret = EINVAL;
+ if (physpath_len == 0)
+ goto out;
+
+ if (strncmp("id1,", physpath, 4) == 0) {
+ physpath += 4;
+ physpath_len -= 4;
+ if (physpath_len == 0)
+ goto out;
+ }
+
+ max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1;
+ parentpath_len = strlen(pdev->si_name);
+ if (max_parentpath_len < parentpath_len) {
+ if (bootverbose)
+ printf("WARNING: Unable to alias %s "
+ "to %s/%s - path too long\n",
+ pdev->si_name, physpath, pdev->si_name);
+ ret = ENAMETOOLONG;
+ goto out;
+ }
+
+ mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK;
+ devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1;
+ devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags);
+ if (devfspath == NULL) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ sprintf(devfspath, "%s/%s", physpath, pdev->si_name);
+ if (old_alias != NULL && strcmp(old_alias->si_name, devfspath) == 0) {
+ /* Retain the existing alias. */
+ *cdev = old_alias;
+ old_alias = NULL;
+ ret = 0;
+ } else {
+ ret = make_dev_alias_p(flags, cdev, pdev, "%s", devfspath);
+ }
+out:
+ if (old_alias != NULL)
+ destroy_dev(old_alias);
+ if (devfspath != NULL)
+ free(devfspath, M_DEVBUF);
+ return (ret);
+}
+
+static void
+destroy_devl(struct cdev *dev)
+{
+ struct cdevsw *csw;
+ struct cdev_privdata *p;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ KASSERT(dev->si_flags & SI_NAMED,
+ ("WARNING: Driver mistake: destroy_dev on %d\n", dev2unit(dev)));
+ KASSERT((dev->si_flags & SI_ETERNAL) == 0,
+ ("WARNING: Driver mistake: destroy_dev on eternal %d\n",
+ dev2unit(dev)));
+
+ devfs_destroy(dev);
+
+ /* Remove name marking */
+ dev->si_flags &= ~SI_NAMED;
+
+ dev->si_refcount++; /* Avoid race with dev_rel() */
+
+ /* If we are a child, remove us from the parents list */
+ if (dev->si_flags & SI_CHILD) {
+ LIST_REMOVE(dev, si_siblings);
+ dev->si_flags &= ~SI_CHILD;
+ }
+
+ /* Kill our children */
+ while (!LIST_EMPTY(&dev->si_children))
+ destroy_devl(LIST_FIRST(&dev->si_children));
+
+ /* Remove from clone list */
+ if (dev->si_flags & SI_CLONELIST) {
+ LIST_REMOVE(dev, si_clone);
+ dev->si_flags &= ~SI_CLONELIST;
+ }
+
+ csw = dev->si_devsw;
+ dev->si_devsw = NULL; /* already NULL for SI_ALIAS */
+ while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) {
+ csw->d_purge(dev);
+ msleep(csw, &devmtx, PRIBIO, "devprg", hz/10);
+ if (dev->si_threadcount)
+ printf("Still %lu threads in %s\n",
+ dev->si_threadcount, devtoname(dev));
+ }
+ while (dev->si_threadcount != 0) {
+ /* Use unique dummy wait ident */
+ msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10);
+ }
+
+ dev_unlock();
+ notify_destroy(dev);
+ mtx_lock(&cdevpriv_mtx);
+ while ((p = LIST_FIRST(&cdev2priv(dev)->cdp_fdpriv)) != NULL) {
+ devfs_destroy_cdevpriv(p);
+ mtx_lock(&cdevpriv_mtx);
+ }
+ mtx_unlock(&cdevpriv_mtx);
+ dev_lock();
+
+ dev->si_drv1 = 0;
+ dev->si_drv2 = 0;
+ bzero(&dev->__si_u, sizeof(dev->__si_u));
+
+ if (!(dev->si_flags & SI_ALIAS)) {
+ /* Remove from cdevsw list */
+ LIST_REMOVE(dev, si_list);
+
+ /* If cdevsw has no more struct cdev *'s, clean it */
+ if (LIST_EMPTY(&csw->d_devs)) {
+ fini_cdevsw(csw);
+ wakeup(&csw->d_devs);
+ }
+ }
+ dev->si_flags &= ~SI_ALIAS;
+ dev->si_refcount--; /* Avoid race with dev_rel() */
+
+ if (dev->si_refcount > 0) {
+ LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list);
+ } else {
+ dev_free_devlocked(dev);
+ }
+}
+
+void
+destroy_dev(struct cdev *dev)
+{
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "destroy_dev");
+ dev_lock();
+ destroy_devl(dev);
+ dev_unlock_and_free();
+}
+
+const char *
+devtoname(struct cdev *dev)
+{
+
+ return (dev->si_name);
+}
+
+int
+dev_stdclone(char *name, char **namep, const char *stem, int *unit)
+{
+ int u, i;
+
+ i = strlen(stem);
+ if (bcmp(stem, name, i) != 0)
+ return (0);
+ if (!isdigit(name[i]))
+ return (0);
+ u = 0;
+ if (name[i] == '0' && isdigit(name[i+1]))
+ return (0);
+ while (isdigit(name[i])) {
+ u *= 10;
+ u += name[i++] - '0';
+ }
+ if (u > 0xffffff)
+ return (0);
+ *unit = u;
+ if (namep)
+ *namep = &name[i];
+ if (name[i])
+ return (2);
+ return (1);
+}
+
+/*
+ * Helper functions for cloning device drivers.
+ *
+ * The objective here is to make it unnecessary for the device drivers to
+ * use rman or similar to manage their unit number space. Due to the way
+ * we do "on-demand" devices, using rman or other "private" methods
+ * will be very tricky to lock down properly once we lock down this file.
+ *
+ * Instead we give the drivers these routines, which put the struct cdev *'s
+ * that are to be managed on their own list, and give the driver the ability
+ * to ask for the first free unit number or a given specified unit number.
+ *
+ * In addition these routines support paired devices (pty, nmdm and similar)
+ * by respecting a number of "flag" bits in the minor number.
+ *
+ */
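+
+/*
+ * Illustrative sketch (not part of the original file): how a cloning driver
+ * typically uses these helpers from its dev_clone event handler, registered
+ * with EVENTHANDLER_REGISTER(dev_clone, ...).  The identifiers
+ * mydev_clonedevs, mydev_cdevsw and mydev_clone are hypothetical
+ * placeholders, not real kernel symbols.
+ */
+#if 0
+static struct clonedevs *mydev_clonedevs;   /* set up once with clone_setup() */
+static struct cdevsw mydev_cdevsw;          /* hypothetical cdevsw; must set D_NEEDMINOR */
+
+static void
+mydev_clone(void *arg, struct ucred *cred, char *name, int namelen,
+    struct cdev **dev)
+{
+    int u;
+
+    if (*dev != NULL)
+        return;
+    if (strcmp(name, "mydev") == 0)
+        u = -1;                             /* ask for the lowest free unit */
+    else if (dev_stdclone(name, NULL, "mydev", &u) != 1)
+        return;                             /* not one of ours */
+    /* Reuse an existing unit's cdev, or reserve the requested/lowest unit. */
+    if (clone_create(&mydev_clonedevs, &mydev_cdevsw, &u, dev, 0) != 0)
+        *dev = make_dev_credf(MAKEDEV_REF, &mydev_cdevsw, u, cred,
+            UID_ROOT, GID_WHEEL, 0600, "mydev%d", u);
+}
+#endif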
+
+struct clonedevs {
+ LIST_HEAD(,cdev) head;
+};
+
+void
+clone_setup(struct clonedevs **cdp)
+{
+
+ *cdp = malloc(sizeof **cdp, M_DEVBUF, M_WAITOK | M_ZERO);
+ LIST_INIT(&(*cdp)->head);
+}
+
+int
+clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up,
+ struct cdev **dp, int extra)
+{
+ struct clonedevs *cd;
+ struct cdev *dev, *ndev, *dl, *de;
+ int unit, low, u;
+
+ KASSERT(*cdp != NULL,
+ ("clone_setup() not called in driver \"%s\"", csw->d_name));
+ KASSERT(!(extra & CLONE_UNITMASK),
+ ("Illegal extra bits (0x%x) in clone_create", extra));
+ KASSERT(*up <= CLONE_UNITMASK,
+ ("Too high unit (0x%x) in clone_create", *up));
+ KASSERT(csw->d_flags & D_NEEDMINOR,
+ ("clone_create() on cdevsw without minor numbers"));
+
+
+ /*
+ * Search the list for a lot of things in one go:
+ * A preexisting match is returned immediately.
+ * The lowest free unit number if we are passed -1, and the place
+ * in the list where we should insert that new element.
+ * The place to insert a specified unit number, which may be
+ * at the end of the list.
+ */
+ unit = *up;
+ ndev = devfs_alloc(MAKEDEV_WAITOK);
+ dev_lock();
+ prep_cdevsw(csw, MAKEDEV_WAITOK);
+ low = extra;
+ de = dl = NULL;
+ cd = *cdp;
+ LIST_FOREACH(dev, &cd->head, si_clone) {
+ KASSERT(dev->si_flags & SI_CLONELIST,
+ ("Dev %p(%s) should be on clonelist", dev, dev->si_name));
+ u = dev2unit(dev);
+ if (u == (unit | extra)) {
+ *dp = dev;
+ dev_unlock();
+ devfs_free(ndev);
+ return (0);
+ }
+ if (unit == -1 && u == low) {
+ low++;
+ de = dev;
+ continue;
+ } else if (u < (unit | extra)) {
+ de = dev;
+ continue;
+ } else if (u > (unit | extra)) {
+ dl = dev;
+ break;
+ }
+ }
+ if (unit == -1)
+ unit = low & CLONE_UNITMASK;
+ dev = newdev(csw, unit | extra, ndev);
+ if (dev->si_flags & SI_CLONELIST) {
+ printf("dev %p (%s) is on clonelist\n", dev, dev->si_name);
+ printf("unit=%d, low=%d, extra=0x%x\n", unit, low, extra);
+ LIST_FOREACH(dev, &cd->head, si_clone) {
+ printf("\t%p %s\n", dev, dev->si_name);
+ }
+ panic("foo");
+ }
+ KASSERT(!(dev->si_flags & SI_CLONELIST),
+ ("Dev %p(%s) should not be on clonelist", dev, dev->si_name));
+ if (dl != NULL)
+ LIST_INSERT_BEFORE(dl, dev, si_clone);
+ else if (de != NULL)
+ LIST_INSERT_AFTER(de, dev, si_clone);
+ else
+ LIST_INSERT_HEAD(&cd->head, dev, si_clone);
+ dev->si_flags |= SI_CLONELIST;
+ *up = unit;
+ dev_unlock_and_free();
+ return (1);
+}
+
+/*
+ * Kill everything still on the list. The driver should already have
+ * disposed of any softc hung off the struct cdev *'s at this time.
+ */
+void
+clone_cleanup(struct clonedevs **cdp)
+{
+ struct cdev *dev;
+ struct cdev_priv *cp;
+ struct clonedevs *cd;
+
+ cd = *cdp;
+ if (cd == NULL)
+ return;
+ dev_lock();
+ while (!LIST_EMPTY(&cd->head)) {
+ dev = LIST_FIRST(&cd->head);
+ LIST_REMOVE(dev, si_clone);
+ KASSERT(dev->si_flags & SI_CLONELIST,
+ ("Dev %p(%s) should be on clonelist", dev, dev->si_name));
+ dev->si_flags &= ~SI_CLONELIST;
+ cp = cdev2priv(dev);
+ if (!(cp->cdp_flags & CDP_SCHED_DTR)) {
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ KASSERT(dev->si_flags & SI_NAMED,
+ ("Driver has goofed while cloning, udev %x unit %x", dev2udev(dev), dev2unit(dev)));
+ destroy_devl(dev);
+ }
+ }
+ dev_unlock_and_free();
+ free(cd, M_DEVBUF);
+ *cdp = NULL;
+}
+
+static TAILQ_HEAD(, cdev_priv) dev_ddtr =
+ TAILQ_HEAD_INITIALIZER(dev_ddtr);
+static struct task dev_dtr_task = TASK_INITIALIZER(0, destroy_dev_tq, NULL);
+
+static void
+destroy_dev_tq(void *ctx, int pending)
+{
+ struct cdev_priv *cp;
+ struct cdev *dev;
+ void (*cb)(void *);
+ void *cb_arg;
+
+ dev_lock();
+ while (!TAILQ_EMPTY(&dev_ddtr)) {
+ cp = TAILQ_FIRST(&dev_ddtr);
+ dev = &cp->cdp_c;
+ KASSERT(cp->cdp_flags & CDP_SCHED_DTR,
+ ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp));
+ TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list);
+ cb = cp->cdp_dtr_cb;
+ cb_arg = cp->cdp_dtr_cb_arg;
+ destroy_devl(dev);
+ dev_unlock_and_free();
+ dev_rel(dev);
+ if (cb != NULL)
+ cb(cb_arg);
+ dev_lock();
+ }
+ dev_unlock();
+}
+
+/*
+ * devmtx shall be locked on entry. devmtx will be unlocked after
+ * function return.
+ */
+static int
+destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+ struct cdev_priv *cp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cp = cdev2priv(dev);
+ if (cp->cdp_flags & CDP_SCHED_DTR) {
+ dev_unlock();
+ return (0);
+ }
+ dev_refl(dev);
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ cp->cdp_dtr_cb = cb;
+ cp->cdp_dtr_cb_arg = arg;
+ TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list);
+ dev_unlock();
+ taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task);
+ return (1);
+}
+
+int
+destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+
+ dev_lock();
+ return (destroy_dev_sched_cbl(dev, cb, arg));
+}
+
+int
+destroy_dev_sched(struct cdev *dev)
+{
+
+ return (destroy_dev_sched_cb(dev, NULL, NULL));
+}
+
+void
+destroy_dev_drain(struct cdevsw *csw)
+{
+
+ dev_lock();
+ while (!LIST_EMPTY(&csw->d_devs)) {
+ msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10);
+ }
+ dev_unlock();
+}
+
+void
+drain_dev_clone_events(void)
+{
+
+ sx_xlock(&clone_drain_lock);
+ sx_xunlock(&clone_drain_lock);
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <sys/kernel.h>
+
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cdev, db_show_cdev)
+{
+ struct cdev_priv *cdp;
+ struct cdev *dev;
+ u_int flags;
+ char buf[512];
+
+ if (!have_addr) {
+ TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) {
+ dev = &cdp->cdp_c;
+ db_printf("%s %p\n", dev->si_name, dev);
+ if (db_pager_quit)
+ break;
+ }
+ return;
+ }
+
+ dev = (struct cdev *)addr;
+ cdp = cdev2priv(dev);
+ db_printf("dev %s ref %d use %ld thr %ld inuse %u fdpriv %p\n",
+ dev->si_name, dev->si_refcount, dev->si_usecount,
+ dev->si_threadcount, cdp->cdp_inuse, cdp->cdp_fdpriv.lh_first);
+ db_printf("devsw %p si_drv0 %d si_drv1 %p si_drv2 %p\n",
+ dev->si_devsw, dev->si_drv0, dev->si_drv1, dev->si_drv2);
+ flags = dev->si_flags;
+#define SI_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 3, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ buf[0] = '\0';
+ SI_FLAG(SI_ETERNAL);
+ SI_FLAG(SI_ALIAS);
+ SI_FLAG(SI_NAMED);
+ SI_FLAG(SI_CHEAPCLONE);
+ SI_FLAG(SI_CHILD);
+ SI_FLAG(SI_DUMPDEV);
+ SI_FLAG(SI_CLONELIST);
+ db_printf("si_flags %s\n", buf);
+
+ flags = cdp->cdp_flags;
+#define CDP_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 4, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ buf[0] = '\0';
+ CDP_FLAG(CDP_ACTIVE);
+ CDP_FLAG(CDP_SCHED_DTR);
+ db_printf("cdp_flags %s\n", buf);
+}
+#endif
diff --git a/sys/kern/kern_cons.c b/sys/kern/kern_cons.c
new file mode 100644
index 0000000..d17846a
--- /dev/null
+++ b/sys/kern/kern_cons.c
@@ -0,0 +1,643 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1991 The Regents of the University of California.
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ *
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)cons.c 7.2 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/msgbuf.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/reboot.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#include <ddb/ddb.h>
+
+#include <machine/cpu.h>
+#include <machine/clock.h>
+
+static MALLOC_DEFINE(M_TTYCONS, "tty console", "tty console handling");
+
+struct cn_device {
+ STAILQ_ENTRY(cn_device) cnd_next;
+ struct consdev *cnd_cn;
+};
+
+#define CNDEVPATHMAX 32
+#define CNDEVTAB_SIZE 4
+static struct cn_device cn_devtab[CNDEVTAB_SIZE];
+static STAILQ_HEAD(, cn_device) cn_devlist =
+ STAILQ_HEAD_INITIALIZER(cn_devlist);
+
+int cons_avail_mask = 0; /* Bit mask. Each registered low-level console
+ * that is currently unavailable for input
+ * (i.e., if it is in graphics mode) will have
+ * this bit cleared.
+ */
+static int cn_mute;
+static char *consbuf; /* buffer used by `consmsgbuf' */
+static struct callout conscallout; /* callout for outputting to constty */
+struct msgbuf consmsgbuf; /* message buffer for console tty */
+static u_char console_pausing; /* pause after each line during probe */
+static char *console_pausestr=
+"<pause; press any key to proceed to next line or '.' to end pause mode>";
+struct tty *constty; /* pointer to console "window" tty */
+static struct mtx cnputs_mtx; /* Mutex for cnputs(). */
+static int use_cnputs_mtx = 0; /* != 0 if cnputs_mtx locking reqd. */
+
+static void constty_timeout(void *arg);
+
+static struct consdev cons_consdev;
+DATA_SET(cons_set, cons_consdev);
+SET_DECLARE(cons_set, struct consdev);
+
+void
+cninit(void)
+{
+ struct consdev *best_cn, *cn, **list;
+
+ /*
+ * Check if we should mute the console (for security reasons, perhaps).
+ * It can be changed dynamically using the sysctl kern.consmute
+ * once we are up and going.
+ */
+ cn_mute = ((boothowto & (RB_MUTE
+ |RB_SINGLE
+ |RB_VERBOSE
+ |RB_ASKNAME)) == RB_MUTE);
+
+ /*
+ * Find the first console with the highest priority.
+ */
+ best_cn = NULL;
+ SET_FOREACH(list, cons_set) {
+ cn = *list;
+ cnremove(cn);
+ /* Skip cons_consdev. */
+ if (cn->cn_ops == NULL)
+ continue;
+ cn->cn_ops->cn_probe(cn);
+ if (cn->cn_pri == CN_DEAD)
+ continue;
+ if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri)
+ best_cn = cn;
+ if (boothowto & RB_MULTIPLE) {
+ /*
+ * Initialize console, and attach to it.
+ */
+ cn->cn_ops->cn_init(cn);
+ cnadd(cn);
+ }
+ }
+ if (best_cn == NULL)
+ return;
+ if ((boothowto & RB_MULTIPLE) == 0) {
+ best_cn->cn_ops->cn_init(best_cn);
+ cnadd(best_cn);
+ }
+ if (boothowto & RB_PAUSE)
+ console_pausing = 1;
+ /*
+ * Make the best console the preferred console.
+ */
+ cnselect(best_cn);
+}
+
+void
+cninit_finish()
+{
+ console_pausing = 0;
+}
+
+/* add a new physical console to back the virtual console */
+int
+cnadd(struct consdev *cn)
+{
+ struct cn_device *cnd;
+ int i;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ if (cnd->cnd_cn == cn)
+ return (0);
+ for (i = 0; i < CNDEVTAB_SIZE; i++) {
+ cnd = &cn_devtab[i];
+ if (cnd->cnd_cn == NULL)
+ break;
+ }
+ if (cnd->cnd_cn != NULL)
+ return (ENOMEM);
+ cnd->cnd_cn = cn;
+ if (cn->cn_name[0] == '\0') {
+ /* XXX: it is unclear if/where this print might output */
+ printf("WARNING: console at %p has no name\n", cn);
+ }
+ STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next);
+ if (STAILQ_FIRST(&cn_devlist) == cnd)
+ ttyconsdev_select(cnd->cnd_cn->cn_name);
+
+ /* Add device to the active mask. */
+ cnavailable(cn, (cn->cn_flags & CN_FLAG_NOAVAIL) == 0);
+
+ return (0);
+}
+
+void
+cnremove(struct consdev *cn)
+{
+ struct cn_device *cnd;
+ int i;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ if (STAILQ_FIRST(&cn_devlist) == cnd)
+ ttyconsdev_select(NULL);
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ cnd->cnd_cn = NULL;
+
+ /* Remove this device from available mask. */
+ for (i = 0; i < CNDEVTAB_SIZE; i++)
+ if (cnd == &cn_devtab[i]) {
+ cons_avail_mask &= ~(1 << i);
+ break;
+ }
+#if 0
+ /*
+ * XXX
+ * syscons gets really confused if console resources are
+ * freed after the system has initialized.
+ */
+ if (cn->cn_term != NULL)
+ cn->cn_ops->cn_term(cn);
+#endif
+ return;
+ }
+}
+
+void
+cnselect(struct consdev *cn)
+{
+ struct cn_device *cnd;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ if (cnd == STAILQ_FIRST(&cn_devlist))
+ return;
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next);
+ ttyconsdev_select(cnd->cnd_cn->cn_name);
+ return;
+ }
+}
+
+void
+cnavailable(struct consdev *cn, int available)
+{
+ int i;
+
+ for (i = 0; i < CNDEVTAB_SIZE; i++) {
+ if (cn_devtab[i].cnd_cn == cn)
+ break;
+ }
+ if (available) {
+ if (i < CNDEVTAB_SIZE)
+ cons_avail_mask |= (1 << i);
+ cn->cn_flags &= ~CN_FLAG_NOAVAIL;
+ } else {
+ if (i < CNDEVTAB_SIZE)
+ cons_avail_mask &= ~(1 << i);
+ cn->cn_flags |= CN_FLAG_NOAVAIL;
+ }
+}
+
+int
+cnunavailable(void)
+{
+
+ return (cons_avail_mask == 0);
+}
+
+/*
+ * sysctl_kern_console() provides output parseable in conscontrol(1).
+ */
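+/*
+ * A hypothetical sample of the resulting string (console names are
+ * illustrative only): "ttyv0,/ttyv0,ttyu0," -- the active consoles come
+ * first, then a '/', then every registered console driver.  Writing a
+ * name adds and selects that console; a leading '-' removes it.
+ */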
+static int
+sysctl_kern_console(SYSCTL_HANDLER_ARGS)
+{
+ struct cn_device *cnd;
+ struct consdev *cp, **list;
+ char *p;
+ int delete, error;
+ struct sbuf *sb;
+
+ sb = sbuf_new(NULL, NULL, CNDEVPATHMAX * 2, SBUF_AUTOEXTEND);
+ if (sb == NULL)
+ return (ENOMEM);
+ sbuf_clear(sb);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ sbuf_printf(sb, "%s,", cnd->cnd_cn->cn_name);
+ sbuf_printf(sb, "/");
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (cp->cn_name[0] != '\0')
+ sbuf_printf(sb, "%s,", cp->cn_name);
+ }
+ sbuf_finish(sb);
+ error = sysctl_handle_string(oidp, sbuf_data(sb), sbuf_len(sb), req);
+ if (error == 0 && req->newptr != NULL) {
+ p = sbuf_data(sb);
+ error = ENXIO;
+ delete = 0;
+ if (*p == '-') {
+ delete = 1;
+ p++;
+ }
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (strcmp(p, cp->cn_name) != 0)
+ continue;
+ if (delete) {
+ cnremove(cp);
+ error = 0;
+ } else {
+ error = cnadd(cp);
+ if (error == 0)
+ cnselect(cp);
+ }
+ break;
+ }
+ }
+ sbuf_delete(sb);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING|CTLFLAG_RW,
+ 0, 0, sysctl_kern_console, "A", "Console device control");
+
+/*
+ * User has changed the state of the console muting.
+ * This may require us to open or close the device in question.
+ */
+static int
+sysctl_kern_consmute(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = sysctl_handle_int(oidp, &cn_mute, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof(cn_mute), sysctl_kern_consmute, "I",
+ "State of the console muting");
+
+void
+cngrab()
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG))
+ cn->cn_ops->cn_grab(cn);
+ }
+}
+
+void
+cnungrab()
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG))
+ cn->cn_ops->cn_ungrab(cn);
+ }
+}
+
+/*
+ * Low level console routines.
+ */
+int
+cngetc(void)
+{
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ while ((c = cncheckc()) == -1)
+ cpu_spinwait();
+ if (c == '\r')
+ c = '\n'; /* console input is always ICRNL */
+ return (c);
+}
+
+int
+cncheckc(void)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
+ c = cn->cn_ops->cn_getc(cn);
+ if (c != -1)
+ return (c);
+ }
+ }
+ return (-1);
+}
+
+void
+cngets(char *cp, size_t size, int visible)
+{
+ char *lp, *end;
+ int c;
+
+ cngrab();
+
+ lp = cp;
+ end = cp + size - 1;
+ for (;;) {
+ c = cngetc() & 0177;
+ switch (c) {
+ case '\n':
+ case '\r':
+ cnputc(c);
+ *lp = '\0';
+ cnungrab();
+ return;
+ case '\b':
+ case '\177':
+ if (lp > cp) {
+ if (visible) {
+ cnputc(c);
+ cnputs(" \b");
+ }
+ lp--;
+ }
+ continue;
+ case '\0':
+ continue;
+ default:
+ if (lp < end) {
+ switch (visible) {
+ case GETS_NOECHO:
+ break;
+ case GETS_ECHOPASS:
+ cnputc('*');
+ break;
+ default:
+ cnputc(c);
+ break;
+ }
+ *lp++ = c;
+ }
+ }
+ }
+}
+
+void
+cnputc(int c)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ char *cp;
+
+ if (cn_mute || c == '\0')
+ return;
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
+ if (c == '\n')
+ cn->cn_ops->cn_putc(cn, '\r');
+ cn->cn_ops->cn_putc(cn, c);
+ }
+ }
+ if (console_pausing && c == '\n' && !kdb_active) {
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(*cp);
+ cngrab();
+ if (cngetc() == '.')
+ console_pausing = 0;
+ cnungrab();
+ cnputc('\r');
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(' ');
+ cnputc('\r');
+ }
+}
+
+void
+cnputs(char *p)
+{
+ int c;
+ int unlock_reqd = 0;
+
+ if (use_cnputs_mtx) {
+ mtx_lock_spin(&cnputs_mtx);
+ unlock_reqd = 1;
+ }
+
+ while ((c = *p++) != '\0')
+ cnputc(c);
+
+ if (unlock_reqd)
+ mtx_unlock_spin(&cnputs_mtx);
+}
+
+static int consmsgbuf_size = 8192;
+SYSCTL_INT(_kern, OID_AUTO, consmsgbuf_size, CTLFLAG_RW, &consmsgbuf_size, 0,
+ "Console tty buffer size");
+
+/*
+ * Redirect console output to a tty.
+ */
+void
+constty_set(struct tty *tp)
+{
+ int size;
+
+ KASSERT(tp != NULL, ("constty_set: NULL tp"));
+ if (consbuf == NULL) {
+ size = consmsgbuf_size;
+ consbuf = malloc(size, M_TTYCONS, M_WAITOK);
+ msgbuf_init(&consmsgbuf, consbuf, size);
+ callout_init(&conscallout, 0);
+ }
+ constty = tp;
+ constty_timeout(NULL);
+}
+
+/*
+ * Disable console redirection to a tty.
+ */
+void
+constty_clear(void)
+{
+ int c;
+
+ constty = NULL;
+ if (consbuf == NULL)
+ return;
+ callout_stop(&conscallout);
+ while ((c = msgbuf_getchar(&consmsgbuf)) != -1)
+ cnputc(c);
+ free(consbuf, M_TTYCONS);
+ consbuf = NULL;
+}
+
+/* Times per second to check for pending console tty messages. */
+static int constty_wakeups_per_second = 5;
+SYSCTL_INT(_kern, OID_AUTO, constty_wakeups_per_second, CTLFLAG_RW,
+ &constty_wakeups_per_second, 0,
+ "Times per second to check for pending console tty messages");
+
+static void
+constty_timeout(void *arg)
+{
+ int c;
+
+ if (constty != NULL) {
+ tty_lock(constty);
+ while ((c = msgbuf_getchar(&consmsgbuf)) != -1) {
+ if (tty_putchar(constty, c) < 0) {
+ tty_unlock(constty);
+ constty = NULL;
+ break;
+ }
+ }
+
+ if (constty != NULL)
+ tty_unlock(constty);
+ }
+ if (constty != NULL) {
+ callout_reset(&conscallout, hz / constty_wakeups_per_second,
+ constty_timeout, NULL);
+ } else {
+ /* Deallocate the constty buffer memory. */
+ constty_clear();
+ }
+}
+
+static void
+cn_drvinit(void *unused)
+{
+
+ mtx_init(&cnputs_mtx, "cnputs_mtx", NULL, MTX_SPIN | MTX_NOWITNESS);
+ use_cnputs_mtx = 1;
+}
+
+SYSINIT(cndev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, cn_drvinit, NULL);
+
+/*
+ * Sysbeep(), if we have hardware for it
+ */
+
+#ifdef HAS_TIMER_SPKR
+
+static int beeping;
+
+static void
+sysbeepstop(void *chan)
+{
+
+ timer_spkr_release();
+ beeping = 0;
+}
+
+int
+sysbeep(int pitch, int period)
+{
+
+ if (timer_spkr_acquire()) {
+ if (!beeping) {
+ /* Something else owns it. */
+ return (EBUSY);
+ }
+ }
+ timer_spkr_setfreq(pitch);
+ if (!beeping) {
+ beeping = period;
+ timeout(sysbeepstop, (void *)NULL, period);
+ }
+ return (0);
+}
+
+#else
+
+/*
+ * No hardware, no sound
+ */
+
+int
+sysbeep(int pitch __unused, int period __unused)
+{
+
+ return (ENODEV);
+}
+
+#endif
+
diff --git a/sys/kern/kern_context.c b/sys/kern/kern_context.c
new file mode 100644
index 0000000..70751d0
--- /dev/null
+++ b/sys/kern/kern_context.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2002 Daniel M. Eischen <deischen@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/ucontext.h>
+
+/*
+ * The first two fields of a ucontext_t are the signal mask and the machine
+ * context. The next field is uc_link; we want to avoid destroying the link
+ * when copying out contexts.
+ */
+#define UC_COPY_SIZE offsetof(ucontext_t, uc_link)
+
+#ifndef _SYS_SYSPROTO_H_
+struct getcontext_args {
+ struct __ucontext *ucp;
+};
+struct setcontext_args {
+ const struct __ucontext *ucp;
+};
+struct swapcontext_args {
+ struct __ucontext *oucp;
+ const struct __ucontext *ucp;
+};
+#endif
+
+int
+sys_getcontext(struct thread *td, struct getcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ bzero(uc.__spare__, sizeof(uc.__spare__));
+ ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
+ }
+ return (ret);
+}
+
+int
+sys_setcontext(struct thread *td, struct setcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask,
+ NULL, 0);
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
+
+int
+sys_swapcontext(struct thread *td, struct swapcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->oucp == NULL || uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ bzero(uc.__spare__, sizeof(uc.__spare__));
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK,
+ &uc.uc_sigmask, NULL, 0);
+ }
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
diff --git a/sys/kern/kern_cpu.c b/sys/kern/kern_cpu.c
new file mode 100644
index 0000000..6df4d3f
--- /dev/null
+++ b/sys/kern/kern_cpu.c
@@ -0,0 +1,1063 @@
+/*-
+ * Copyright (c) 2004-2007 Nate Lawson (SDG)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/timetc.h>
+#include <sys/taskqueue.h>
+
+#include "cpufreq_if.h"
+
+/*
+ * Common CPU frequency glue code. Drivers for specific hardware can
+ * attach this interface to allow users to get/set the CPU frequency.
+ */
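+/*
+ * Example usage (hypothetical values): the glue attaches its sysctls to the
+ * cpu device, so levels can be listed and set with e.g.
+ *	sysctl dev.cpu.0.freq_levels	(reports "freq/power" pairs)
+ *	sysctl dev.cpu.0.freq=1800	(request the 1800 MHz level)
+ * The handlers behind these OIDs are defined later in this file.
+ */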
+
+/*
+ * Number of levels we can handle. Levels are synthesized from settings
+ * so for M settings and N drivers, there may be M*N levels.
+ */
+#define CF_MAX_LEVELS 64
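+/*
+ * For example (illustrative numbers only): 4 drivers each exporting 16
+ * settings could synthesize up to 4 * 16 = 64 levels, exactly this cap.
+ */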
+
+struct cf_saved_freq {
+ struct cf_level level;
+ int priority;
+ SLIST_ENTRY(cf_saved_freq) link;
+};
+
+struct cpufreq_softc {
+ struct sx lock;
+ struct cf_level curr_level;
+ int curr_priority;
+ SLIST_HEAD(, cf_saved_freq) saved_freq;
+ struct cf_level_lst all_levels;
+ int all_count;
+ int max_mhz;
+ device_t dev;
+ struct sysctl_ctx_list sysctl_ctx;
+ struct task startup_task;
+ struct cf_level *levels_buf;
+};
+
+struct cf_setting_array {
+ struct cf_setting sets[MAX_SETTINGS];
+ int count;
+ TAILQ_ENTRY(cf_setting_array) link;
+};
+
+TAILQ_HEAD(cf_setting_lst, cf_setting_array);
+
+#define CF_MTX_INIT(x) sx_init((x), "cpufreq lock")
+#define CF_MTX_LOCK(x) sx_xlock((x))
+#define CF_MTX_UNLOCK(x) sx_xunlock((x))
+#define CF_MTX_ASSERT(x) sx_assert((x), SX_XLOCKED)
+
+#define CF_DEBUG(msg...) do { \
+ if (cf_verbose) \
+ printf("cpufreq: " msg); \
+ } while (0)
+
+static int cpufreq_attach(device_t dev);
+static void cpufreq_startup_task(void *ctx, int pending);
+static int cpufreq_detach(device_t dev);
+static int cf_set_method(device_t dev, const struct cf_level *level,
+ int priority);
+static int cf_get_method(device_t dev, struct cf_level *level);
+static int cf_levels_method(device_t dev, struct cf_level *levels,
+ int *count);
+static int cpufreq_insert_abs(struct cpufreq_softc *sc,
+ struct cf_setting *sets, int count);
+static int cpufreq_expand_set(struct cpufreq_softc *sc,
+ struct cf_setting_array *set_arr);
+static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
+ struct cf_level *dup, struct cf_setting *set);
+static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
+static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
+static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);
+
+static device_method_t cpufreq_methods[] = {
+ DEVMETHOD(device_probe, bus_generic_probe),
+ DEVMETHOD(device_attach, cpufreq_attach),
+ DEVMETHOD(device_detach, cpufreq_detach),
+
+ DEVMETHOD(cpufreq_set, cf_set_method),
+ DEVMETHOD(cpufreq_get, cf_get_method),
+ DEVMETHOD(cpufreq_levels, cf_levels_method),
+ {0, 0}
+};
+static driver_t cpufreq_driver = {
+ "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
+};
+static devclass_t cpufreq_dc;
+DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
+
+static int cf_lowest_freq;
+static int cf_verbose;
+TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
+TUNABLE_INT("debug.cpufreq.verbose", &cf_verbose);
+static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL,
+ "cpufreq debugging");
+SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RW, &cf_lowest_freq, 1,
+ "Don't provide levels below this frequency.");
+SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RW, &cf_verbose, 1,
+ "Print verbose debugging messages");
+
+static int
+cpufreq_attach(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ struct pcpu *pc;
+ device_t parent;
+ uint64_t rate;
+ int numdevs;
+
+ CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
+ sc = device_get_softc(dev);
+ parent = device_get_parent(dev);
+ sc->dev = dev;
+ sysctl_ctx_init(&sc->sysctl_ctx);
+ TAILQ_INIT(&sc->all_levels);
+ CF_MTX_INIT(&sc->lock);
+ sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
+ SLIST_INIT(&sc->saved_freq);
+ /* Try to get nominal CPU freq to use it as maximum later if needed */
+ sc->max_mhz = cpu_get_nominal_mhz(dev);
+ /* If that fails, try to measure the current rate */
+ if (sc->max_mhz <= 0) {
+ pc = cpu_get_pcpu(dev);
+ if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
+ sc->max_mhz = rate / 1000000;
+ else
+ sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
+ }
+
+ /*
+ * Only initialize one set of sysctls for all CPUs. In the future,
+ * if multiple CPUs can have different settings, we can move these
+ * sysctls to be under every CPU instead of just the first one.
+ */
+ numdevs = devclass_get_count(cpufreq_dc);
+ if (numdevs > 1)
+ return (0);
+
+ CF_DEBUG("initializing one-time data for %s\n",
+ device_get_nameunit(dev));
+ sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
+ M_DEVBUF, M_WAITOK);
+ SYSCTL_ADD_PROC(&sc->sysctl_ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
+ OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
+ cpufreq_curr_sysctl, "I", "Current CPU frequency");
+ SYSCTL_ADD_PROC(&sc->sysctl_ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
+ OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
+ cpufreq_levels_sysctl, "A", "CPU frequency levels");
+
+ /*
+ * Queue a one-shot broadcast that levels have changed.
+ * It will run once the system has completed booting.
+ */
+ TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
+ taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
+
+ return (0);
+}
+
+/* Handle any work to be done for all drivers that attached during boot. */
+static void
+cpufreq_startup_task(void *ctx, int pending)
+{
+
+ cpufreq_settings_changed((device_t)ctx);
+}
+
+static int
+cpufreq_detach(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ struct cf_saved_freq *saved_freq;
+ int numdevs;
+
+ CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
+ sc = device_get_softc(dev);
+ sysctl_ctx_free(&sc->sysctl_ctx);
+
+ while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) {
+ SLIST_REMOVE_HEAD(&sc->saved_freq, link);
+ free(saved_freq, M_TEMP);
+ }
+
+ /* Only clean up these resources when the last device is detaching. */
+ numdevs = devclass_get_count(cpufreq_dc);
+ if (numdevs == 1) {
+ CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
+ free(sc->levels_buf, M_DEVBUF);
+ }
+
+ return (0);
+}
+
+static int
+cf_set_method(device_t dev, const struct cf_level *level, int priority)
+{
+ struct cpufreq_softc *sc;
+ const struct cf_setting *set;
+ struct cf_saved_freq *saved_freq, *curr_freq;
+ struct pcpu *pc;
+ int error, i;
+
+ sc = device_get_softc(dev);
+ error = 0;
+ set = NULL;
+ saved_freq = NULL;
+
+ /* We are going to change levels so notify the pre-change handler. */
+ EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
+ if (error != 0) {
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+ return (error);
+ }
+
+ CF_MTX_LOCK(&sc->lock);
+
+#ifdef SMP
+ /*
+ * If still booting and secondary CPUs not started yet, don't allow
+ * changing the frequency until they're online. This is because we
+ * can't switch to them using sched_bind() and thus we'd only be
+ * switching the main CPU. XXXTODO: Need to think more about how to
+ * handle having different CPUs at different frequencies.
+ */
+ if (mp_ncpus > 1 && !smp_active) {
+ device_printf(dev, "rejecting change, SMP not started yet\n");
+ error = ENXIO;
+ goto out;
+ }
+#endif /* SMP */
+
+ /*
+ * If the requested level has a lower priority, don't allow
+ * the new level right now.
+ */
+ if (priority < sc->curr_priority) {
+ CF_DEBUG("ignoring, requested prio %d less than curr %d\n", priority,
+ sc->curr_priority);
+ error = EPERM;
+ goto out;
+ }
+
+ /*
+ * If the caller didn't specify a level and one is saved, prepare to
+ * restore the saved level. If none has been saved, return an error.
+ */
+ if (level == NULL) {
+ saved_freq = SLIST_FIRST(&sc->saved_freq);
+ if (saved_freq == NULL) {
+ CF_DEBUG("NULL level, no saved level\n");
+ error = ENXIO;
+ goto out;
+ }
+ level = &saved_freq->level;
+ priority = saved_freq->priority;
+ CF_DEBUG("restoring saved level, freq %d prio %d\n",
+ level->total_set.freq, priority);
+ }
+
+ /* Reject levels that are below our specified threshold. */
+ if (level->total_set.freq < cf_lowest_freq) {
+ CF_DEBUG("rejecting freq %d, less than %d limit\n",
+ level->total_set.freq, cf_lowest_freq);
+ error = EINVAL;
+ goto out;
+ }
+
+ /* If already at this level, just return. */
+ if (sc->curr_level.total_set.freq == level->total_set.freq) {
+ CF_DEBUG("skipping freq %d, same as current level %d\n",
+ level->total_set.freq, sc->curr_level.total_set.freq);
+ goto skip;
+ }
+
+ /* First, set the absolute frequency via its driver. */
+ set = &level->abs_set;
+ if (set->dev) {
+ if (!device_is_attached(set->dev)) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /* Bind to the target CPU before switching. */
+ pc = cpu_get_pcpu(set->dev);
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
+ CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
+ device_get_nameunit(set->dev), PCPU_GET(cpuid));
+ error = CPUFREQ_DRV_SET(set->dev, set);
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ if (error) {
+ goto out;
+ }
+ }
+
+ /* Next, set any/all relative frequencies via their drivers. */
+ for (i = 0; i < level->rel_count; i++) {
+ set = &level->rel_set[i];
+ if (!device_is_attached(set->dev)) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /* Bind to the target CPU before switching. */
+ pc = cpu_get_pcpu(set->dev);
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
+ CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
+ device_get_nameunit(set->dev), PCPU_GET(cpuid));
+ error = CPUFREQ_DRV_SET(set->dev, set);
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ if (error) {
+ /* XXX Back out any successful setting? */
+ goto out;
+ }
+ }
+
+skip:
+ /*
+ * Before recording the current level, check if we're going to a
+ * higher priority. If so, save the previous level and priority.
+ */
+ if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
+ priority > sc->curr_priority) {
+ CF_DEBUG("saving level, freq %d prio %d\n",
+ sc->curr_level.total_set.freq, sc->curr_priority);
+ curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT);
+ if (curr_freq == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ curr_freq->level = sc->curr_level;
+ curr_freq->priority = sc->curr_priority;
+ SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link);
+ }
+ sc->curr_level = *level;
+ sc->curr_priority = priority;
+
+ /* If we were restoring a saved state, reset it to "unused". */
+ if (saved_freq != NULL) {
+ CF_DEBUG("resetting saved level\n");
+ sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
+ SLIST_REMOVE_HEAD(&sc->saved_freq, link);
+ free(saved_freq, M_TEMP);
+ }
+
+out:
+ CF_MTX_UNLOCK(&sc->lock);
+
+ /*
+ * We changed levels (or attempted to) so notify the post-change
+ * handler of new frequency or error.
+ */
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+ if (error && set)
+ device_printf(set->dev, "set freq failed, err %d\n", error);
+
+ return (error);
+}
+
+static int
+cf_get_method(device_t dev, struct cf_level *level)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ struct cf_setting *curr_set, set;
+ struct pcpu *pc;
+ device_t *devs;
+ int count, error, i, n, numdevs;
+ uint64_t rate;
+
+ sc = device_get_softc(dev);
+ error = 0;
+ levels = NULL;
+
+ /* If we already know the current frequency, we're done. */
+ CF_MTX_LOCK(&sc->lock);
+ curr_set = &sc->curr_level.total_set;
+ if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
+ CF_DEBUG("get returning known freq %d\n", curr_set->freq);
+ goto out;
+ }
+ CF_MTX_UNLOCK(&sc->lock);
+
+ /*
+ * We need to figure out the current level. Loop through every
+ * driver, getting the current setting. Then, attempt to get a best
+ * match of settings against each level.
+ */
+ count = CF_MAX_LEVELS;
+ levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
+ if (levels == NULL)
+ return (ENOMEM);
+ error = CPUFREQ_LEVELS(sc->dev, levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf("cpufreq: need to increase CF_MAX_LEVELS\n");
+ free(levels, M_TEMP);
+ return (error);
+ }
+ error = device_get_children(device_get_parent(dev), &devs, &numdevs);
+ if (error) {
+ free(levels, M_TEMP);
+ return (error);
+ }
+
+ /*
+ * Reacquire the lock and search for the given level.
+ *
+ * XXX Note: this is not quite right since we really need to go
+ * through each level and compare both absolute and relative
+ * settings for each driver in the system before making a match.
+ * The estimation code below catches this case though.
+ */
+ CF_MTX_LOCK(&sc->lock);
+ for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
+ if (!device_is_attached(devs[n]))
+ continue;
+ if (CPUFREQ_DRV_GET(devs[n], &set) != 0)
+ continue;
+ for (i = 0; i < count; i++) {
+ if (set.freq == levels[i].total_set.freq) {
+ sc->curr_level = levels[i];
+ break;
+ }
+ }
+ }
+ free(devs, M_TEMP);
+ if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
+ CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
+ goto out;
+ }
+
+ /*
+ * We couldn't find an exact match, so attempt to estimate and then
+ * match against a level.
+ */
+ pc = cpu_get_pcpu(dev);
+ if (pc == NULL) {
+ error = ENXIO;
+ goto out;
+ }
+ cpu_est_clockrate(pc->pc_cpuid, &rate);
+ rate /= 1000000;
+ for (i = 0; i < count; i++) {
+ if (CPUFREQ_CMP(rate, levels[i].total_set.freq)) {
+ sc->curr_level = levels[i];
+ CF_DEBUG("get estimated freq %d\n", curr_set->freq);
+ goto out;
+ }
+ }
+ error = ENXIO;
+
+out:
+ if (error == 0)
+ *level = sc->curr_level;
+
+ CF_MTX_UNLOCK(&sc->lock);
+ if (levels)
+ free(levels, M_TEMP);
+ return (error);
+}
+
+static int
+cf_levels_method(device_t dev, struct cf_level *levels, int *count)
+{
+ struct cf_setting_array *set_arr;
+ struct cf_setting_lst rel_sets;
+ struct cpufreq_softc *sc;
+ struct cf_level *lev;
+ struct cf_setting *sets;
+ struct pcpu *pc;
+ device_t *devs;
+ int error, i, numdevs, set_count, type;
+ uint64_t rate;
+
+ if (levels == NULL || count == NULL)
+ return (EINVAL);
+
+ TAILQ_INIT(&rel_sets);
+ sc = device_get_softc(dev);
+ error = device_get_children(device_get_parent(dev), &devs, &numdevs);
+ if (error)
+ return (error);
+ sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
+ if (sets == NULL) {
+ free(devs, M_TEMP);
+ return (ENOMEM);
+ }
+
+ /* Get settings from all cpufreq drivers. */
+ CF_MTX_LOCK(&sc->lock);
+ for (i = 0; i < numdevs; i++) {
+ /* Skip devices that aren't ready. */
+ if (!device_is_attached(devs[i]))
+ continue;
+
+ /*
+ * Get settings, skipping drivers that offer no settings or
+ * provide settings for informational purposes only.
+ */
+ error = CPUFREQ_DRV_TYPE(devs[i], &type);
+ if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) {
+ if (error == 0) {
+ CF_DEBUG("skipping info-only driver %s\n",
+ device_get_nameunit(devs[i]));
+ }
+ continue;
+ }
+ set_count = MAX_SETTINGS;
+ error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count);
+ if (error || set_count == 0)
+ continue;
+
+ /* Add the settings to our absolute/relative lists. */
+ switch (type & CPUFREQ_TYPE_MASK) {
+ case CPUFREQ_TYPE_ABSOLUTE:
+ error = cpufreq_insert_abs(sc, sets, set_count);
+ break;
+ case CPUFREQ_TYPE_RELATIVE:
+ CF_DEBUG("adding %d relative settings\n", set_count);
+ set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
+ if (set_arr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
+ set_arr->count = set_count;
+ TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error)
+ goto out;
+ }
+
+ /*
+ * If there are no absolute levels, create a fake one at 100%. We
+ * then cache the clockrate for later use as our base frequency.
+ */
+ if (TAILQ_EMPTY(&sc->all_levels)) {
+ if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
+ sc->max_mhz = cpu_get_nominal_mhz(dev);
+ /*
+ * If the CPU can't report a rate for 100%, hope
+ * the CPU is running at its nominal rate right now,
+ * and use that instead.
+ */
+ if (sc->max_mhz <= 0) {
+ pc = cpu_get_pcpu(dev);
+ cpu_est_clockrate(pc->pc_cpuid, &rate);
+ sc->max_mhz = rate / 1000000;
+ }
+ }
+ memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets));
+ sets[0].freq = sc->max_mhz;
+ sets[0].dev = NULL;
+ error = cpufreq_insert_abs(sc, sets, 1);
+ if (error)
+ goto out;
+ }
+
+ /* Create a combined list of absolute + relative levels. */
+ TAILQ_FOREACH(set_arr, &rel_sets, link)
+ cpufreq_expand_set(sc, set_arr);
+
+ /* If the caller doesn't have enough space, return the actual count. */
+ if (sc->all_count > *count) {
+ *count = sc->all_count;
+ error = E2BIG;
+ goto out;
+ }
+
+ /* Finally, output the list of levels. */
+ i = 0;
+ TAILQ_FOREACH(lev, &sc->all_levels, link) {
+
+ /* Skip levels whose frequency falls below the allowed minimum. */
+ if (lev->total_set.freq < cf_lowest_freq) {
+ sc->all_count--;
+ continue;
+ }
+
+ levels[i] = *lev;
+ i++;
+ }
+ *count = sc->all_count;
+ error = 0;
+
+out:
+ /* Clear all levels since we regenerate them each time. */
+ while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
+ TAILQ_REMOVE(&sc->all_levels, lev, link);
+ free(lev, M_TEMP);
+ }
+ sc->all_count = 0;
+
+ CF_MTX_UNLOCK(&sc->lock);
+ while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
+ TAILQ_REMOVE(&rel_sets, set_arr, link);
+ free(set_arr, M_TEMP);
+ }
+ free(devs, M_TEMP);
+ free(sets, M_TEMP);
+ return (error);
+}
+
+/*
+ * Create levels for an array of absolute settings and insert them in
+ * sorted order in the specified list.
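+ * The list is kept ordered from highest to lowest total frequency.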
+ */
+static int
+cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
+ int count)
+{
+ struct cf_level_lst *list;
+ struct cf_level *level, *search;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ list = &sc->all_levels;
+ for (i = 0; i < count; i++) {
+ level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
+ if (level == NULL)
+ return (ENOMEM);
+ level->abs_set = sets[i];
+ level->total_set = sets[i];
+ level->total_set.dev = NULL;
+ sc->all_count++;
+
+ if (TAILQ_EMPTY(list)) {
+ CF_DEBUG("adding abs setting %d at head\n",
+ sets[i].freq);
+ TAILQ_INSERT_HEAD(list, level, link);
+ continue;
+ }
+
+ TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
+ if (sets[i].freq <= search->total_set.freq) {
+ CF_DEBUG("adding abs setting %d after %d\n",
+ sets[i].freq, search->total_set.freq);
+ TAILQ_INSERT_AFTER(list, search, level, link);
+ break;
+ }
+ }
+ }
+ return (0);
+}
+
+/*
+ * Expand a group of relative settings, creating derived levels from them.
+ */
+static int
+cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
+{
+ struct cf_level *fill, *search;
+ struct cf_setting *set;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ /*
+ * Walk the set of all existing levels in reverse. This is so we
+ * create derived states from the lowest absolute settings first
+ * and discard duplicates created from higher absolute settings.
+ * For instance, a level of 50 MHz derived from 100 MHz + 50% is
+ * preferable to 200 MHz + 25% because absolute settings are more
+ * efficient since they often change the voltage as well.
+ */
+ TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) {
+ /* Add each setting to the level, duplicating if necessary. */
+ for (i = 0; i < set_arr->count; i++) {
+ set = &set_arr->sets[i];
+
+ /*
+ * If this setting is less than 100%, split the level
+ * into two and add this setting to the new level.
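+ * (Relative settings are expressed in hundredths of a percent,
+ * so 10000 corresponds to 100%.)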
+ */
+ fill = search;
+ if (set->freq < 10000) {
+ fill = cpufreq_dup_set(sc, search, set);
+
+ /*
+ * The new level was a duplicate of an existing
+ * level or its absolute setting is too high
+ * so we freed it. For example, we discard a
+ * derived level of 1000 MHz/25% if a level
+ * of 500 MHz/100% already exists.
+ */
+ if (fill == NULL)
+ break;
+ }
+
+ /* Add this setting to the existing or new level. */
+ KASSERT(fill->rel_count < MAX_SETTINGS,
+ ("cpufreq: too many relative drivers (%d)",
+ MAX_SETTINGS));
+ fill->rel_set[fill->rel_count] = *set;
+ fill->rel_count++;
+ CF_DEBUG(
+ "expand set added rel setting %d%% to %d level\n",
+ set->freq / 100, fill->total_set.freq);
+ }
+ }
+
+ return (0);
+}
+
+static struct cf_level *
+cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
+ struct cf_setting *set)
+{
+ struct cf_level_lst *list;
+ struct cf_level *fill, *itr;
+ struct cf_setting *fill_set, *itr_set;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ /*
+ * Create a new level, copy it from the old one, and update the
+ * total frequency and power by the percentage specified in the
+ * relative setting.
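+ * For example, duplicating a 2000 MHz level against a 50% relative
+ * setting (set->freq == 5000) yields a derived level of
+ * 2000 * 5000 / 10000 == 1000 MHz.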
+ */
+ fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
+ if (fill == NULL)
+ return (NULL);
+ *fill = *dup;
+ fill_set = &fill->total_set;
+ fill_set->freq =
+ ((uint64_t)fill_set->freq * set->freq) / 10000;
+ if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
+ fill_set->power = ((uint64_t)fill_set->power * set->freq)
+ / 10000;
+ }
+ if (set->lat != CPUFREQ_VAL_UNKNOWN) {
+ if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
+ fill_set->lat += set->lat;
+ else
+ fill_set->lat = set->lat;
+ }
+ CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);
+
+ /*
+ * If we copied an old level that we already modified (say, at 100%),
+ * we need to remove that setting before adding this one. Since we
+ * process each setting array in order, we know any settings for this
+ * driver will be found at the end.
+ */
+ for (i = fill->rel_count; i != 0; i--) {
+ if (fill->rel_set[i - 1].dev != set->dev)
+ break;
+ CF_DEBUG("removed last relative driver: %s\n",
+ device_get_nameunit(set->dev));
+ fill->rel_count--;
+ }
+
+ /*
+ * Insert the new level in sorted order. If it is a duplicate of an
+ * existing level (1) or has an absolute setting higher than the
+ * existing level (2), do not add it. We can do this since any such
+ * level is guaranteed to use less power. For example (1), a level with
+ * one absolute setting of 800 MHz uses less power than one composed
+ * of an absolute setting of 1600 MHz and a relative setting at 50%.
+ * Also for example (2), a level of 800 MHz/75% is preferable to
+ * 1600 MHz/25% even though the latter has a lower total frequency.
+ */
+ list = &sc->all_levels;
+ KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set"));
+ TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
+ itr_set = &itr->total_set;
+ if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
+ CF_DEBUG("dup set rejecting %d (dupe)\n",
+ fill_set->freq);
+ itr = NULL;
+ break;
+ } else if (fill_set->freq < itr_set->freq) {
+ if (fill->abs_set.freq <= itr->abs_set.freq) {
+ CF_DEBUG(
+ "dup done, inserting new level %d after %d\n",
+ fill_set->freq, itr_set->freq);
+ TAILQ_INSERT_AFTER(list, itr, fill, link);
+ sc->all_count++;
+ } else {
+ CF_DEBUG("dup set rejecting %d (abs too big)\n",
+ fill_set->freq);
+ itr = NULL;
+ }
+ break;
+ }
+ }
+
+ /* We didn't find a good place for this new level so free it. */
+ if (itr == NULL) {
+ CF_DEBUG("dup set freeing new level %d (not optimal)\n",
+ fill_set->freq);
+ free(fill, M_TEMP);
+ fill = NULL;
+ }
+
+ return (fill);
+}
+
+static int
+cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ int best, count, diff, bdiff, devcount, error, freq, i, n;
+ device_t *devs;
+
+ devs = NULL;
+ sc = oidp->oid_arg1;
+ levels = sc->levels_buf;
+
+ error = CPUFREQ_GET(sc->dev, &levels[0]);
+ if (error)
+ goto out;
+ freq = levels[0].total_set.freq;
+ error = sysctl_handle_int(oidp, &freq, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ goto out;
+
+ /*
+ * While we only call cpufreq_get() on one device (assuming all
+ * CPUs have equal levels), we call cpufreq_set() on all CPUs.
+ * This is needed for some MP systems.
+ */
+ error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
+ if (error)
+ goto out;
+ for (n = 0; n < devcount; n++) {
+ count = CF_MAX_LEVELS;
+ error = CPUFREQ_LEVELS(devs[n], levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf(
+ "cpufreq: need to increase CF_MAX_LEVELS\n");
+ break;
+ }
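+ /* Pick the level whose total frequency is closest to the request. */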
+ best = 0;
+ bdiff = 1 << 30;
+ for (i = 0; i < count; i++) {
+ diff = abs(levels[i].total_set.freq - freq);
+ if (diff < bdiff) {
+ bdiff = diff;
+ best = i;
+ }
+ }
+ error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER);
+ }
+
+out:
+ if (devs)
+ free(devs, M_TEMP);
+ return (error);
+}
+
+static int
+cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ struct cf_setting *set;
+ struct sbuf sb;
+ int count, error, i;
+
+ sc = oidp->oid_arg1;
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+
+ /* Get settings from the device and generate the output string. */
+ count = CF_MAX_LEVELS;
+ levels = sc->levels_buf;
+ if (levels == NULL) {
+ sbuf_delete(&sb);
+ return (ENOMEM);
+ }
+ error = CPUFREQ_LEVELS(sc->dev, levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf("cpufreq: need to increase CF_MAX_LEVELS\n");
+ goto out;
+ }
+ if (count) {
+ for (i = 0; i < count; i++) {
+ set = &levels[i].total_set;
+ sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
+ }
+ } else
+ sbuf_cpy(&sb, "0");
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+
+out:
+ sbuf_delete(&sb);
+ return (error);
+}
+
+static int
+cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ device_t dev;
+ struct cf_setting *sets;
+ struct sbuf sb;
+ int error, i, set_count;
+
+ dev = oidp->oid_arg1;
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+
+ /* Get settings from the device and generate the output string. */
+ set_count = MAX_SETTINGS;
+ sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
+ if (sets == NULL) {
+ sbuf_delete(&sb);
+ return (ENOMEM);
+ }
+ error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
+ if (error)
+ goto out;
+ if (set_count) {
+ for (i = 0; i < set_count; i++)
+ sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
+ } else
+ sbuf_cpy(&sb, "0");
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+
+out:
+ free(sets, M_TEMP);
+ sbuf_delete(&sb);
+ return (error);
+}
+
+int
+cpufreq_register(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ device_t cf_dev, cpu_dev;
+
+ /* Add a sysctl to get each driver's settings separately. */
+ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
+ SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+ OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0,
+ cpufreq_settings_sysctl, "A", "CPU frequency driver settings");
+
+ /*
+ * Add only one cpufreq device to each CPU. Currently, all CPUs
+ * must offer the same levels and be switched at the same time.
+ */
+ cpu_dev = device_get_parent(dev);
+ if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
+ sc = device_get_softc(cf_dev);
+ sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
+ return (0);
+ }
+
+ /* Add the child device and possibly sysctls. */
+ cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
+ if (cf_dev == NULL)
+ return (ENOMEM);
+ device_quiet(cf_dev);
+
+ return (device_probe_and_attach(cf_dev));
+}
+
+int
+cpufreq_unregister(device_t dev)
+{
+ device_t cf_dev, *devs;
+ int cfcount, devcount, error, i, type;
+
+ /*
+ * If this is the last cpufreq child device, remove the control
+ * device as well. We identify cpufreq children by calling a method
+ * they support.
+ */
+ error = device_get_children(device_get_parent(dev), &devs, &devcount);
+ if (error)
+ return (error);
+ cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
+ if (cf_dev == NULL) {
+ device_printf(dev,
+ "warning: cpufreq_unregister called with no cpufreq device active\n");
+ return (0);
+ }
+ cfcount = 0;
+ for (i = 0; i < devcount; i++) {
+ if (!device_is_attached(devs[i]))
+ continue;
+ if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0)
+ cfcount++;
+ }
+ if (cfcount <= 1)
+ device_delete_child(device_get_parent(cf_dev), cf_dev);
+ free(devs, M_TEMP);
+
+ return (0);
+}
+
+int
+cpufreq_settings_changed(device_t dev)
+{
+
+ EVENTHANDLER_INVOKE(cpufreq_levels_changed,
+ device_get_unit(device_get_parent(dev)));
+ return (0);
+}
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
new file mode 100644
index 0000000..42c95c9
--- /dev/null
+++ b/sys/kern/kern_cpuset.c
@@ -0,0 +1,1166 @@
+/*-
+ * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Copyright (c) 2008 Nokia Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/cpuset.h>
+#include <sys/sx.h>
+#include <sys/queue.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif /* DDB */
+
+/*
+ * cpusets provide a mechanism for creating and manipulating sets of
+ * processors for the purpose of constraining the scheduling of threads to
+ * specific processors.
+ *
+ * Each process belongs to an identified set; by default this is set 1. Each
+ * thread may further restrict the cpus it may run on to a subset of this
+ * named set. This creates an anonymous set which other threads and processes
+ * may not join by number.
+ *
+ * The named set is referred to herein as the 'base' set to avoid ambiguity.
+ * This set is usually a child of a 'root' set while the anonymous set may
+ * simply be referred to as a mask. In the syscall api these are referred to
+ * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
+ *
+ * Threads inherit their set from their creator whether it be anonymous or
+ * not. This means that anonymous sets are immutable because they may be
+ * shared. To modify an anonymous set a new set is created with the desired
+ * mask and the same parent as the existing anonymous set. This gives the
+ * illusion of each thread having a private mask.
+ *
+ * Via the syscall apis a user may ask to retrieve or modify the root, base,
+ * or mask that is discovered via a pid, tid, or setid. Modifying a set
+ * modifies all numbered and anonymous child sets to comply with the new mask.
+ * Modifying a pid or tid's mask applies only to that tid but must still
+ * exist within the assigned parent set.
+ *
+ * A thread may not be assigned to a group separate from other threads in
+ * the process. This is to remove ambiguity when the setid is queried with
+ * a pid argument. There is no other technical limitation.
+ *
+ * This somewhat complex arrangement is intended to make it easy for
+ * applications to query available processors and bind their threads to
+ * specific processors while also allowing administrators to dynamically
+ * reprovision by changing sets which apply to groups of processes.
+ *
+ * A simple application should not concern itself with sets at all; it
+ * should instead apply masks to its own threads via CPU_WHICH_TID and a
+ * -1 id meaning 'curthread'. It may query available cpus for that tid
+ * with a getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID,
+ * -1, ...).
+ */
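+
+/*
+ * An illustrative userland sketch (not part of this file; it assumes the
+ * cpuset_getaffinity()/cpuset_setaffinity() system calls that expose the
+ * interface described above). A thread wishing to pin itself to CPU 0
+ * might do something like:
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/cpuset.h>
+ *
+ *	cpuset_t mask;
+ *
+ *	// cpus available to this process
+ *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
+ *	    sizeof(mask), &mask);
+ *	CPU_ZERO(&mask);
+ *	CPU_SET(0, &mask);
+ *	// install an anonymous mask for curthread
+ *	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
+ *	    sizeof(mask), &mask);
+ */
+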
+static uma_zone_t cpuset_zone;
+static struct mtx cpuset_lock;
+static struct setlist cpuset_ids;
+static struct unrhdr *cpuset_unr;
+static struct cpuset *cpuset_zero;
+
+/* Return the size of cpuset_t at the kernel level */
+SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
+ 0, sizeof(cpuset_t), "sizeof(cpuset_t)");
+
+cpuset_t *cpuset_root;
+
+/*
+ * Acquire a reference to a cpuset, all pointers must be tracked with refs.
+ */
+struct cpuset *
+cpuset_ref(struct cpuset *set)
+{
+
+ refcount_acquire(&set->cs_ref);
+ return (set);
+}
+
+/*
+ * Walks up the tree from 'set' to find the root. Returns the root
+ * referenced.
+ */
+static struct cpuset *
+cpuset_refroot(struct cpuset *set)
+{
+
+ for (; set->cs_parent != NULL; set = set->cs_parent)
+ if (set->cs_flags & CPU_SET_ROOT)
+ break;
+ cpuset_ref(set);
+
+ return (set);
+}
+
+/*
+ * Find the first non-anonymous set starting from 'set'. Returns this set
+ * referenced. May return the passed in set with an extra ref if it is
+ * not anonymous.
+ */
+static struct cpuset *
+cpuset_refbase(struct cpuset *set)
+{
+
+ if (set->cs_id == CPUSET_INVALID)
+ set = set->cs_parent;
+ cpuset_ref(set);
+
+ return (set);
+}
+
+/*
+ * Release a reference in a context where it is safe to allocate.
+ */
+void
+cpuset_rel(struct cpuset *set)
+{
+ cpusetid_t id;
+
+ if (refcount_release(&set->cs_ref) == 0)
+ return;
+ mtx_lock_spin(&cpuset_lock);
+ LIST_REMOVE(set, cs_siblings);
+ id = set->cs_id;
+ if (id != CPUSET_INVALID)
+ LIST_REMOVE(set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+ cpuset_rel(set->cs_parent);
+ uma_zfree(cpuset_zone, set);
+ if (id != CPUSET_INVALID)
+ free_unr(cpuset_unr, id);
+}
+
+/*
+ * Deferred release must be used when in a context that is not safe to
+ * allocate/free. This places any unreferenced sets on the list 'head'.
+ */
+static void
+cpuset_rel_defer(struct setlist *head, struct cpuset *set)
+{
+
+ if (refcount_release(&set->cs_ref) == 0)
+ return;
+ mtx_lock_spin(&cpuset_lock);
+ LIST_REMOVE(set, cs_siblings);
+ if (set->cs_id != CPUSET_INVALID)
+ LIST_REMOVE(set, cs_link);
+ LIST_INSERT_HEAD(head, set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+}
+
+/*
+ * Complete a deferred release. Removes the set from the list provided to
+ * cpuset_rel_defer.
+ */
+static void
+cpuset_rel_complete(struct cpuset *set)
+{
+ LIST_REMOVE(set, cs_link);
+ cpuset_rel(set->cs_parent);
+ uma_zfree(cpuset_zone, set);
+}
+
+/*
+ * Find a set based on an id. Returns it with a ref.
+ */
+static struct cpuset *
+cpuset_lookup(cpusetid_t setid, struct thread *td)
+{
+ struct cpuset *set;
+
+ if (setid == CPUSET_INVALID)
+ return (NULL);
+ mtx_lock_spin(&cpuset_lock);
+ LIST_FOREACH(set, &cpuset_ids, cs_link)
+ if (set->cs_id == setid)
+ break;
+ if (set)
+ cpuset_ref(set);
+ mtx_unlock_spin(&cpuset_lock);
+
+ KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
+ if (set != NULL && jailed(td->td_ucred)) {
+ struct cpuset *jset, *tset;
+
+ jset = td->td_ucred->cr_prison->pr_cpuset;
+ for (tset = set; tset != NULL; tset = tset->cs_parent)
+ if (tset == jset)
+ break;
+ if (tset == NULL) {
+ cpuset_rel(set);
+ set = NULL;
+ }
+ }
+
+ return (set);
+}
+
+/*
+ * Create a set in the space provided in 'set' with the provided parameters.
+ * The set is returned with a single ref. May return EDEADLK if the set
+ * will have no valid cpu based on restrictions from the parent.
+ */
+static int
+_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
+ cpusetid_t id)
+{
+
+ if (!CPU_OVERLAP(&parent->cs_mask, mask))
+ return (EDEADLK);
+ CPU_COPY(mask, &set->cs_mask);
+ LIST_INIT(&set->cs_children);
+ refcount_init(&set->cs_ref, 1);
+ set->cs_flags = 0;
+ mtx_lock_spin(&cpuset_lock);
+ CPU_AND(&set->cs_mask, &parent->cs_mask);
+ set->cs_id = id;
+ set->cs_parent = cpuset_ref(parent);
+ LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
+ if (set->cs_id != CPUSET_INVALID)
+ LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+
+ return (0);
+}
+
+/*
+ * Create a new non-anonymous set with the requested parent and mask. May
+ * return failures if the mask is invalid or a new number can not be
+ * allocated.
+ */
+static int
+cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
+{
+ struct cpuset *set;
+ cpusetid_t id;
+ int error;
+
+ id = alloc_unr(cpuset_unr);
+ if (id == -1)
+ return (ENFILE);
+ *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = _cpuset_create(set, parent, mask, id);
+ if (error == 0)
+ return (0);
+ free_unr(cpuset_unr, id);
+ uma_zfree(cpuset_zone, set);
+
+ return (error);
+}
+
+/*
+ * Recursively check for errors that would occur from applying mask to
+ * the tree of sets starting at 'set'. Checks for sets that would become
+ * empty as well as RDONLY flags.
+ */
+static int
+cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
+{
+ struct cpuset *nset;
+ cpuset_t newmask;
+ int error;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ if (set->cs_flags & CPU_SET_RDONLY)
+ return (EPERM);
+ if (check_mask) {
+ if (!CPU_OVERLAP(&set->cs_mask, mask))
+ return (EDEADLK);
+ CPU_COPY(&set->cs_mask, &newmask);
+ CPU_AND(&newmask, mask);
+ } else
+ CPU_COPY(mask, &newmask);
+ error = 0;
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
+ break;
+ return (error);
+}
+
+/*
+ * Applies the mask 'mask' without checking for empty sets or permissions.
+ */
+static void
+cpuset_update(struct cpuset *set, cpuset_t *mask)
+{
+ struct cpuset *nset;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ CPU_AND(&set->cs_mask, mask);
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ cpuset_update(nset, &set->cs_mask);
+
+ return;
+}
+
+/*
+ * Modify the set 'set' to use a copy of the mask provided. Apply this new
+ * mask to restrict all children in the tree. Checks for validity before
+ * applying the changes.
+ */
+static int
+cpuset_modify(struct cpuset *set, cpuset_t *mask)
+{
+ struct cpuset *root;
+ int error;
+
+ error = priv_check(curthread, PRIV_SCHED_CPUSET);
+ if (error)
+ return (error);
+ /*
+ * If we are called from within a jail, we do not allow modifying
+ * the jail's dedicated root cpuset, but we may still allow child
+ * sets to be changed.
+ */
+ if (jailed(curthread->td_ucred) &&
+ set->cs_flags & CPU_SET_ROOT)
+ return (EPERM);
+ /*
+ * Verify that we have access to this set of
+ * cpus.
+ */
+ root = set->cs_parent;
+ if (root && !CPU_SUBSET(&root->cs_mask, mask))
+ return (EINVAL);
+ mtx_lock_spin(&cpuset_lock);
+ error = cpuset_testupdate(set, mask, 0);
+ if (error)
+ goto out;
+ CPU_COPY(mask, &set->cs_mask);
+ cpuset_update(set, mask);
+out:
+ mtx_unlock_spin(&cpuset_lock);
+
+ return (error);
+}
+
+/*
+ * Resolve the 'which' parameter of several cpuset apis.
+ *
+ * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
+ * checks for permission via p_cansched().
+ *
+ * For WHICH_SET returns a valid set with a new reference.
+ *
+ * -1 may be supplied for any argument to mean the current proc/thread or
+ * the base set of the current thread. May fail with ESRCH/EPERM.
+ */
+static int
+cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
+ struct cpuset **setp)
+{
+ struct cpuset *set;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ *pp = p = NULL;
+ *tdp = td = NULL;
+ *setp = set = NULL;
+ switch (which) {
+ case CPU_WHICH_PID:
+ if (id == -1) {
+ PROC_LOCK(curproc);
+ p = curproc;
+ break;
+ }
+ if ((p = pfind(id)) == NULL)
+ return (ESRCH);
+ break;
+ case CPU_WHICH_TID:
+ if (id == -1) {
+ PROC_LOCK(curproc);
+ p = curproc;
+ td = curthread;
+ break;
+ }
+ td = tdfind(id, -1);
+ if (td == NULL)
+ return (ESRCH);
+ p = td->td_proc;
+ break;
+ case CPU_WHICH_CPUSET:
+ if (id == -1) {
+ thread_lock(curthread);
+ set = cpuset_refbase(curthread->td_cpuset);
+ thread_unlock(curthread);
+ } else
+ set = cpuset_lookup(id, curthread);
+ if (set) {
+ *setp = set;
+ return (0);
+ }
+ return (ESRCH);
+ case CPU_WHICH_JAIL:
+ {
+ /* Find `set' for prison with given id. */
+ struct prison *pr;
+
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(curthread->td_ucred->cr_prison, id);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ESRCH);
+ cpuset_ref(pr->pr_cpuset);
+ *setp = pr->pr_cpuset;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ case CPU_WHICH_IRQ:
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ error = p_cansched(curthread, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ if (td == NULL)
+ td = FIRST_THREAD_IN_PROC(p);
+ *pp = p;
+ *tdp = td;
+ return (0);
+}
+
+/*
+ * Create an anonymous set with the provided mask in the space provided by
+ * 'fset'. If the passed in set is anonymous we use its parent otherwise
+ * the new set is a child of 'set'.
+ */
+static int
+cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
+{
+ struct cpuset *parent;
+
+ if (set->cs_id == CPUSET_INVALID)
+ parent = set->cs_parent;
+ else
+ parent = set;
+ if (!CPU_SUBSET(&parent->cs_mask, mask))
+ return (EDEADLK);
+ return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+}
+
+/*
+ * Handle two cases for replacing the base set or mask of an entire process.
+ *
+ * 1) Set is non-null and mask is null. This reparents all anonymous sets
+ * to the provided set and replaces all non-anonymous td_cpusets with the
+ * provided set.
+ * 2) Mask is non-null and set is null. This replaces or creates anonymous
+ * sets for every thread with the existing base as a parent.
+ *
+ * This is overly complicated because we can't allocate while holding a
+ * spinlock and spinlocks must be held while changing and examining thread
+ * state.
+ */
+static int
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+{
+ struct setlist freelist;
+ struct setlist droplist;
+ struct cpuset *tdset;
+ struct cpuset *nset;
+ struct thread *td;
+ struct proc *p;
+ int threads;
+ int nfree;
+ int error;
+ /*
+ * The algorithm requires two passes due to locking considerations.
+ *
+ * 1) Lookup the process and acquire the locks in the required order.
+ * 2) If enough cpusets have not been allocated release the locks and
+ * allocate them. Loop.
+ */
+ LIST_INIT(&freelist);
+ LIST_INIT(&droplist);
+ nfree = 0;
+ for (;;) {
+ error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
+ if (error)
+ goto out;
+ if (nfree >= p->p_numthreads)
+ break;
+ threads = p->p_numthreads;
+ PROC_UNLOCK(p);
+ for (; nfree < threads; nfree++) {
+ nset = uma_zalloc(cpuset_zone, M_WAITOK);
+ LIST_INSERT_HEAD(&freelist, nset, cs_link);
+ }
+ }
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * Now that the appropriate locks are held and we have enough cpusets,
+ * make sure the operation will succeed before applying changes. The
+ * proc lock prevents td_cpuset from changing between calls.
+ */
+ error = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ tdset = td->td_cpuset;
+ /*
+ * Verify that a new mask doesn't specify cpus outside of
+ * the set the thread is a member of.
+ */
+ if (mask) {
+ if (tdset->cs_id == CPUSET_INVALID)
+ tdset = tdset->cs_parent;
+ if (!CPU_SUBSET(&tdset->cs_mask, mask))
+ error = EDEADLK;
+ /*
+ * Verify that a new set won't leave an existing thread
+ * mask without a cpu to run on. It can, however, restrict
+ * the set.
+ */
+ } else if (tdset->cs_id == CPUSET_INVALID) {
+ if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
+ error = EDEADLK;
+ }
+ thread_unlock(td);
+ if (error)
+ goto unlock_out;
+ }
+ /*
+ * Replace each thread's cpuset while using deferred release. We
+ * must do this because the thread lock must be held while operating
+ * on the thread and this limits the type of operations allowed.
+ */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ /*
+ * If we presently have an anonymous set or are applying a
+ * mask we must create an anonymous shadow set. That is
+ * either parented to our existing base or the supplied set.
+ *
+ * If we have a base set with no anonymous shadow we simply
+ * replace it outright.
+ */
+ tdset = td->td_cpuset;
+ if (tdset->cs_id == CPUSET_INVALID || mask) {
+ nset = LIST_FIRST(&freelist);
+ LIST_REMOVE(nset, cs_link);
+ if (mask)
+ error = cpuset_shadow(tdset, nset, mask);
+ else
+ error = _cpuset_create(nset, set,
+ &tdset->cs_mask, CPUSET_INVALID);
+ if (error) {
+ LIST_INSERT_HEAD(&freelist, nset, cs_link);
+ thread_unlock(td);
+ break;
+ }
+ } else
+ nset = cpuset_ref(set);
+ cpuset_rel_defer(&droplist, tdset);
+ td->td_cpuset = nset;
+ sched_affinity(td);
+ thread_unlock(td);
+ }
+unlock_out:
+ PROC_UNLOCK(p);
+out:
+ while ((nset = LIST_FIRST(&droplist)) != NULL)
+ cpuset_rel_complete(nset);
+ while ((nset = LIST_FIRST(&freelist)) != NULL) {
+ LIST_REMOVE(nset, cs_link);
+ uma_zfree(cpuset_zone, nset);
+ }
+ return (error);
+}
+
+/*
+ * Return a string representing a valid layout for a cpuset_t object.
+ * The caller must supply a buffer of at least CPUSETBUFSIZ bytes.
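+ * For example (an illustrative case assuming a cpuset_t made up of two
+ * 64-bit words), a set containing only CPUs 0-3 is rendered as "f,0".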
+ */
+char *
+cpusetobj_strprint(char *buf, const cpuset_t *set)
+{
+ char *tbuf;
+ size_t i, bytesp, bufsiz;
+
+ tbuf = buf;
+ bytesp = 0;
+ bufsiz = CPUSETBUFSIZ;
+
+ for (i = 0; i < (_NCPUWORDS - 1); i++) {
+ bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
+ bufsiz -= bytesp;
+ tbuf += bytesp;
+ }
+ snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
+ return (buf);
+}
+
+/*
+ * Build a valid cpuset_t object from a string representation.
+ * The caller must supply a buffer of at least CPUSETBUFSIZ bytes.
+ */
+int
+cpusetobj_strscan(cpuset_t *set, const char *buf)
+{
+ u_int nwords;
+ int i, ret;
+
+ if (strlen(buf) > CPUSETBUFSIZ - 1)
+ return (-1);
+
+ /* Allow a shorter version of the mask to be passed when necessary. */
+ nwords = 1;
+ for (i = 0; buf[i] != '\0'; i++)
+ if (buf[i] == ',')
+ nwords++;
+ if (nwords > _NCPUWORDS)
+ return (-1);
+
+ CPU_ZERO(set);
+ for (i = 0; i < (nwords - 1); i++) {
+ ret = sscanf(buf, "%lx,", &set->__bits[i]);
+ if (ret == 0 || ret == -1)
+ return (-1);
+ buf = strstr(buf, ",");
+ if (buf == NULL)
+ return (-1);
+ buf++;
+ }
+ ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
+ if (ret == 0 || ret == -1)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Apply an anonymous mask to a single thread.
+ */
+int
+cpuset_setthread(lwpid_t id, cpuset_t *mask)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ nset = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
+ if (error)
+ goto out;
+ set = NULL;
+ thread_lock(td);
+ error = cpuset_shadow(td->td_cpuset, nset, mask);
+ if (error == 0) {
+ set = td->td_cpuset;
+ td->td_cpuset = nset;
+ sched_affinity(td);
+ nset = NULL;
+ }
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ if (set)
+ cpuset_rel(set);
+out:
+ if (nset)
+ uma_zfree(cpuset_zone, nset);
+ return (error);
+}
+
+/*
+ * Creates the cpuset for thread0. We make two sets:
+ *
+ * 0 - The root set which should represent all valid processors in the
+ * system. It is initially created with a mask of all processors
+ * because we don't know what processors are valid until cpuset_init()
+ * runs. This set is immutable.
+ * 1 - The default set which all processes are a member of until changed.
+ * This allows an administrator to move all threads off of given cpus to
+ * dedicate them to high priority tasks or save power etc.
+ */
+struct cpuset *
+cpuset_thread0(void)
+{
+ struct cpuset *set;
+ int error;
+
+ cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
+ /*
+ * Create the root system set for the whole machine. Doesn't use
+ * cpuset_create() due to NULL parent.
+ */
+ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+ CPU_FILL(&set->cs_mask);
+ LIST_INIT(&set->cs_children);
+ LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
+ set->cs_ref = 1;
+ set->cs_flags = CPU_SET_ROOT;
+ cpuset_zero = set;
+ cpuset_root = &set->cs_mask;
+ /*
+ * Now derive a default, modifiable set from that to give out.
+ */
+ set = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
+ KASSERT(error == 0, ("Error creating default set: %d\n", error));
+ /*
+ * Initialize the unit allocator. 0 and 1 are allocated above.
+ */
+ cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
+
+ return (set);
+}
+
+/*
+ * Create a cpuset as cpuset_create() would, but additionally mark the
+ * new 'set' as a root set.
+ *
+ * We are not going to reparent the td to it. Use cpuset_setproc_update_set()
+ * for that.
+ *
+ * In case of no error, returns the set in *setp locked with a reference.
+ */
+int
+cpuset_create_root(struct prison *pr, struct cpuset **setp)
+{
+ struct cpuset *set;
+ int error;
+
+ KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
+ KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
+
+ error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
+ if (error)
+ return (error);
+
+ KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
+ __func__, __LINE__));
+
+ /* Mark the set as root. */
+ set = *setp;
+ set->cs_flags |= CPU_SET_ROOT;
+
+ return (0);
+}
+
+int
+cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
+{
+ int error;
+
+ KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
+ KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
+
+ cpuset_ref(set);
+ error = cpuset_setproc(p->p_pid, set, NULL);
+ if (error)
+ return (error);
+ cpuset_rel(set);
+ return (0);
+}
+
+/*
+ * This is called once the final set of system cpus is known. Modifies
+ * the root set and all children and marks the root read-only.
+ */
+static void
+cpuset_init(void *arg)
+{
+ cpuset_t mask;
+
+ mask = all_cpus;
+ if (cpuset_modify(cpuset_zero, &mask))
+ panic("Can't set initial cpuset mask.\n");
+ cpuset_zero->cs_flags |= CPU_SET_RDONLY;
+}
+SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_args {
+ cpusetid_t *setid;
+};
+#endif
+int
+sys_cpuset(struct thread *td, struct cpuset_args *uap)
+{
+ struct cpuset *root;
+ struct cpuset *set;
+ int error;
+
+ thread_lock(td);
+ root = cpuset_refroot(td->td_cpuset);
+ thread_unlock(td);
+ error = cpuset_create(&set, root, &root->cs_mask);
+ cpuset_rel(root);
+ if (error)
+ return (error);
+ error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
+ if (error == 0)
+ error = cpuset_setproc(-1, set, NULL);
+ cpuset_rel(set);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setid_args {
+ cpuwhich_t which;
+ id_t id;
+ cpusetid_t setid;
+};
+#endif
+int
+sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
+{
+ struct cpuset *set;
+ int error;
+
+ /*
+ * Presently we only support per-process sets.
+ */
+ if (uap->which != CPU_WHICH_PID)
+ return (EINVAL);
+ set = cpuset_lookup(uap->setid, td);
+ if (set == NULL)
+ return (ESRCH);
+ error = cpuset_setproc(uap->id, set, NULL);
+ cpuset_rel(set);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_getid_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ cpusetid_t *setid;
+};
+#endif
+int
+sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ cpusetid_t id;
+ int error;
+
+ if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
+ return (EINVAL);
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ return (error);
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_refbase(ttd->td_cpuset);
+ thread_unlock(ttd);
+ PROC_UNLOCK(p);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ return (EINVAL);
+ }
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ nset = cpuset_refroot(set);
+ cpuset_rel(set);
+ set = nset;
+ break;
+ case CPU_LEVEL_CPUSET:
+ break;
+ case CPU_LEVEL_WHICH:
+ break;
+ }
+ id = set->cs_id;
+ cpuset_rel(set);
+ if (error == 0)
+ error = copyout(&id, uap->setid, sizeof(id));
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_getaffinity_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t cpusetsize;
+ cpuset_t *mask;
+};
+#endif
+int
+sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
+{
+ struct thread *ttd;
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct proc *p;
+ cpuset_t *mask;
+ int error;
+ size_t size;
+
+ if (uap->cpusetsize < sizeof(cpuset_t) ||
+ uap->cpusetsize > CPU_MAXSIZE / NBBY)
+ return (ERANGE);
+ size = uap->cpusetsize;
+ mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ goto out;
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ error = EINVAL;
+ goto out;
+ }
+ if (uap->level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ CPU_COPY(&nset->cs_mask, mask);
+ cpuset_rel(nset);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ thread_lock(ttd);
+ CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_PID:
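+ /* The process mask is the union of all its threads' masks. */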
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ thread_lock(ttd);
+ CPU_OR(mask, &ttd->td_cpuset->cs_mask);
+ thread_unlock(ttd);
+ }
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ CPU_COPY(&set->cs_mask, mask);
+ break;
+ case CPU_WHICH_IRQ:
+ error = intr_getaffinity(uap->id, mask);
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (set)
+ cpuset_rel(set);
+ if (p)
+ PROC_UNLOCK(p);
+ if (error == 0)
+ error = copyout(mask, uap->mask, size);
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setaffinity_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t cpusetsize;
+ const cpuset_t *mask;
+};
+#endif
+int
+sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ cpuset_t *mask;
+ int error;
+
+ if (uap->cpusetsize < sizeof(cpuset_t) ||
+ uap->cpusetsize > CPU_MAXSIZE / NBBY)
+ return (ERANGE);
+ mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = copyin(uap->mask, mask, uap->cpusetsize);
+ if (error)
+ goto out;
+ /*
+ * Verify that no high bits are set.
+ */
+ if (uap->cpusetsize > sizeof(cpuset_t)) {
+ char *end;
+ char *cp;
+
+ end = cp = (char *)&mask->__bits;
+ end += uap->cpusetsize;
+ cp += sizeof(cpuset_t);
+ while (cp != end)
+ if (*cp++ != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ }
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ break;
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ PROC_UNLOCK(p);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ error = EINVAL;
+ goto out;
+ }
+ if (uap->level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ error = cpuset_modify(nset, mask);
+ cpuset_rel(nset);
+ cpuset_rel(set);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ error = cpuset_setthread(uap->id, mask);
+ break;
+ case CPU_WHICH_PID:
+ error = cpuset_setproc(uap->id, NULL, mask);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ error = cpuset_which(uap->which, uap->id, &p,
+ &ttd, &set);
+ if (error == 0) {
+ error = cpuset_modify(set, mask);
+ cpuset_rel(set);
+ }
+ break;
+ case CPU_WHICH_IRQ:
+ error = intr_setaffinity(uap->id, mask);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
+#ifdef DDB
+void
+ddb_display_cpuset(const cpuset_t *set)
+{
+ int cpu, once;
+
+ for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+ if (CPU_ISSET(cpu, set)) {
+ if (once == 0) {
+ db_printf("%d", cpu);
+ once = 1;
+ } else
+ db_printf(",%d", cpu);
+ }
+ }
+ if (once == 0)
+ db_printf("<none>");
+}
+
+DB_SHOW_COMMAND(cpusets, db_show_cpusets)
+{
+ struct cpuset *set;
+
+ LIST_FOREACH(set, &cpuset_ids, cs_link) {
+ db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
+ set, set->cs_id, set->cs_ref, set->cs_flags,
+ (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
+ db_printf(" mask=");
+ ddb_display_cpuset(&set->cs_mask);
+ db_printf("\n");
+ if (db_pager_quit)
+ break;
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/kern_ctf.c b/sys/kern/kern_ctf.c
new file mode 100644
index 0000000..319414c
--- /dev/null
+++ b/sys/kern/kern_ctf.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008 John Birrell <jb@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Note this file is included by both link_elf.c and link_elf_obj.c.
+ *
+ * The CTF header structure definition can't be used here because it's
+ * (annoyingly) covered by the CDDL. We will just use a few bytes from
+ * it as an integer array where we 'know' what they mean.
+ */
+#define CTF_HDR_SIZE 36
+#define CTF_HDR_STRTAB_U32 7
+#define CTF_HDR_STRLEN_U32 8
+
+#ifdef DDB_CTF
+static void *
+z_alloc(void *nil, u_int items, u_int size)
+{
+ void *ptr;
+
+ ptr = malloc(items * size, M_TEMP, M_NOWAIT);
+ return ptr;
+}
+
+static void
+z_free(void *nil, void *ptr)
+{
+ free(ptr, M_TEMP);
+}
+
+#endif
+
+static int
+link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc)
+{
+#ifdef DDB_CTF
+ Elf_Ehdr *hdr = NULL;
+ Elf_Shdr *shdr = NULL;
+ caddr_t ctftab = NULL;
+ caddr_t raw = NULL;
+ caddr_t shstrtab = NULL;
+ elf_file_t ef = (elf_file_t) lf;
+ int flags;
+ int i;
+ int nbytes;
+ ssize_t resid;
+ size_t sz;
+ struct nameidata nd;
+ struct thread *td = curthread;
+ uint8_t ctf_hdr[CTF_HDR_SIZE];
+#endif
+ int error = 0;
+
+ if (lf == NULL || lc == NULL)
+ return (EINVAL);
+
+ /* Set the defaults for no CTF present. That's not a crime! */
+ bzero(lc, sizeof(*lc));
+
+#ifdef DDB_CTF
+ /*
+ * First check if we've tried to load CTF data previously and the
+ * CTF ELF section wasn't found. We flag that condition by setting
+ * ctfcnt to -1. See below.
+ */
+ if (ef->ctfcnt < 0)
+ return (EFTYPE);
+
+ /* Now check if we've already loaded the CTF data. */
+ if (ef->ctfcnt > 0) {
+ /* We only need to load once. */
+ lc->ctftab = ef->ctftab;
+ lc->ctfcnt = ef->ctfcnt;
+ lc->symtab = ef->ddbsymtab;
+ lc->strtab = ef->ddbstrtab;
+ lc->strcnt = ef->ddbstrcnt;
+ lc->nsym = ef->ddbsymcnt;
+ lc->ctfoffp = (uint32_t **) &ef->ctfoff;
+ lc->typoffp = (uint32_t **) &ef->typoff;
+ lc->typlenp = &ef->typlen;
+ return (0);
+ }
+
+ /*
+ * We need to try reading the CTF data. Flag no CTF data present
+ * by default and if we actually succeed in reading it, we'll
+ * update ctfcnt to the number of bytes read.
+ */
+ ef->ctfcnt = -1;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, lf->pathname, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /* Allocate memory for the ELF header. */
+ if ((hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read the ELF header. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr),
+ 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid,
+ td)) != 0)
+ goto out;
+
+ /* Sanity check. */
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0 ||
+ hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate memory for all the section headers */
+ if ((shdr = malloc(nbytes, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read all the section headers */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes,
+ hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td)) != 0)
+ goto out;
+
+ /*
+ * We need to search for the CTF section by name, so if the
+ * section names aren't present, then we can't locate the
+ * .SUNW_ctf section containing the CTF data.
+ */
+ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) {
+ printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n",
+ __func__, __LINE__, lf->pathname, hdr->e_shstrndx,
+ shdr[hdr->e_shstrndx].sh_type);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Allocate memory to buffer the section header strings. */
+ if ((shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER,
+ M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read the section header strings. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab,
+ shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid,
+ td)) != 0)
+ goto out;
+
+ /* Search for the section containing the CTF data. */
+ for (i = 0; i < hdr->e_shnum; i++)
+ if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0)
+ break;
+
+ /* Check if the CTF section wasn't found. */
+ if (i >= hdr->e_shnum) {
+ printf("%s(%d): module %s has no .SUNW_ctf section\n",
+ __func__, __LINE__, lf->pathname);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Read the CTF header. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr),
+ shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td)) != 0)
+ goto out;
+
+ /* Check the CTF magic number. (XXX check for big endian!) */
+ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) {
+ printf("%s(%d): module %s has invalid format\n",
+ __func__, __LINE__, lf->pathname);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Check if version 2. */
+ if (ctf_hdr[2] != 2) {
+ printf("%s(%d): module %s CTF format version is %d "
+ "(2 expected)\n",
+ __func__, __LINE__, lf->pathname, ctf_hdr[2]);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Check if the data is compressed. */
+ if ((ctf_hdr[3] & 0x1) != 0) {
+ uint32_t *u32 = (uint32_t *) ctf_hdr;
+
+ /*
+ * The last two fields in the CTF header are the offset
+ * from the end of the header to the start of the string
+ * data and the length of that string data. Use this
+ * information to determine the size of the decompressed
+ * CTF data buffer required.
+ */
+ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] +
+ sizeof(ctf_hdr);
+
+ /*
+ * Allocate memory for the compressed CTF data, including
+ * the header (which isn't compressed).
+ */
+ if ((raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ } else {
+ /*
+ * The CTF data is not compressed, so the ELF section
+ * size is the same as the buffer size required.
+ */
+ sz = shdr[i].sh_size;
+ }
+
+ /*
+ * Allocate memory to buffer the CTF data in its decompressed
+ * form.
+ */
+ if ((ctftab = malloc(sz, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Read the CTF data into the raw buffer if compressed, or
+ * directly into the CTF buffer otherwise.
+ */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw,
+ shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED,
+ td->td_ucred, NOCRED, &resid, td)) != 0)
+ goto out;
+
+ /* Check if decompression is required. */
+ if (raw != NULL) {
+ z_stream zs;
+ int ret;
+
+ /*
+ * The header isn't compressed, so copy that into the
+ * CTF buffer first.
+ */
+ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr));
+
+ /* Initialise the zlib structure. */
+ bzero(&zs, sizeof(zs));
+ zs.zalloc = z_alloc;
+ zs.zfree = z_free;
+
+ if (inflateInit(&zs) != Z_OK) {
+ error = EIO;
+ goto out;
+ }
+
+ zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr);
+ zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr);
+ zs.avail_out = sz - sizeof(ctf_hdr);
+ zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr);
+ if ((ret = inflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret);
+ error = EIO;
+ goto out;
+ }
+ }
+
+ /* Got the CTF data! */
+ ef->ctftab = ctftab;
+ ef->ctfcnt = shdr[i].sh_size;
+
+ /* We'll retain the memory allocated for the CTF data. */
+ ctftab = NULL;
+
+ /* Let the caller use the CTF data read. */
+ lc->ctftab = ef->ctftab;
+ lc->ctfcnt = ef->ctfcnt;
+ lc->symtab = ef->ddbsymtab;
+ lc->strtab = ef->ddbstrtab;
+ lc->strcnt = ef->ddbstrcnt;
+ lc->nsym = ef->ddbsymcnt;
+ lc->ctfoffp = (uint32_t **) &ef->ctfoff;
+ lc->typoffp = (uint32_t **) &ef->typoff;
+ lc->typlenp = &ef->typlen;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+
+ if (hdr != NULL)
+ free(hdr, M_LINKER);
+ if (shdr != NULL)
+ free(shdr, M_LINKER);
+ if (shstrtab != NULL)
+ free(shstrtab, M_LINKER);
+ if (ctftab != NULL)
+ free(ctftab, M_LINKER);
+ if (raw != NULL)
+ free(raw, M_LINKER);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ return (error);
+}
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
new file mode 100644
index 0000000..9e9010f
--- /dev/null
+++ b/sys/kern/kern_descrip.c
@@ -0,0 +1,4016 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/capability.h>
+#include <sys/conf.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/selinfo.h>
+#include <sys/pipe.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/protosw.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/unistd.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+
+#include <ddb/ddb.h>
+
+static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
+static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
+ "file desc to leader structures");
+static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
+
+MALLOC_DECLARE(M_FADVISE);
+
+static uma_zone_t file_zone;
+
+void (*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
+
+static int closefp(struct filedesc *fdp, int fd, struct file *fp,
+ struct thread *td, int holdleaders);
+static int fd_first_free(struct filedesc *fdp, int low, int size);
+static int fd_last_used(struct filedesc *fdp, int size);
+static void fdgrowtable(struct filedesc *fdp, int nfd);
+static void fdunused(struct filedesc *fdp, int fd);
+static void fdused(struct filedesc *fdp, int fd);
+static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
+static int fill_procdesc_info(struct procdesc *pdp,
+ struct kinfo_file *kif);
+static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
+static int fill_sem_info(struct file *fp, struct kinfo_file *kif);
+static int fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
+
+/*
+ * Each process has:
+ *
+ * - An array of open file descriptors (fd_ofiles)
+ * - An array of file flags (fd_ofileflags)
+ * - A bitmap recording which descriptors are in use (fd_map)
+ *
+ * A process starts out with NDFILE descriptors. The value of NDFILE has
+ * been selected based on the historical limit of 20 open files, and an
+ * assumption that the majority of processes, especially short-lived
+ * processes like shells, will never need more.
+ *
+ * If this initial allocation is exhausted, a larger descriptor table and
+ * map are allocated dynamically, and the pointers in the process's struct
+ * filedesc are updated to point to those. This is repeated every time
+ * the process runs out of file descriptors (provided it hasn't hit its
+ * resource limit).
+ *
+ * Since threads may hold references to individual descriptor table
+ * entries, the tables are never freed. Instead, they are placed on a
+ * linked list and freed only when the struct filedesc is released.
+ */
+#define NDFILE 20
+#define NDSLOTSIZE sizeof(NDSLOTTYPE)
+#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
+#define NDSLOT(x) ((x) / NDENTRIES)
+#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
+#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
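+
+/*
+ * A minimal userland sketch of the arithmetic behind NDSLOT()/NDBIT(),
+ * assuming NDSLOTTYPE is unsigned long; the X-prefixed names are
+ * illustrative stand-ins rather than kernel symbols, and the block is
+ * kept under #if 0 so it is never compiled here.
+ */
+#if 0
+#include <limits.h>
+#include <stdio.h>
+
+typedef unsigned long xslot_t;			/* stand-in for NDSLOTTYPE */
+#define	XNDENTRIES	(sizeof(xslot_t) * CHAR_BIT)
+#define	XNDSLOT(x)	((x) / XNDENTRIES)
+#define	XNDBIT(x)	((xslot_t)1 << ((x) % XNDENTRIES))
+
+int
+main(void)
+{
+	int fd = 70;
+
+	/* On LP64 this prints word 1, mask 0x40 (bit 6 of that word). */
+	printf("fd %d -> word %zu, mask %#lx\n",
+	    fd, XNDSLOT(fd), XNDBIT(fd));
+	return (0);
+}
+#endif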
+
+/*
+ * SLIST entry used to keep track of ofiles which must be reclaimed when
+ * the process exits.
+ */
+struct freetable {
+ struct filedescent *ft_table;
+ SLIST_ENTRY(freetable) ft_next;
+};
+
+/*
+ * Initial allocation: a filedesc structure + the head of SLIST used to
+ * keep track of old ofiles + enough space for NDFILE descriptors.
+ */
+struct filedesc0 {
+ struct filedesc fd_fd;
+ SLIST_HEAD(, freetable) fd_free;
+ struct filedescent fd_dfiles[NDFILE];
+ NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
+};
+
+/*
+ * Descriptor management.
+ */
+volatile int openfiles; /* actual number of open files */
+struct mtx sigio_lock; /* mtx to protect pointers to sigio */
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+
+/* A mutex to protect the association between a proc and filedesc. */
+static struct mtx fdesc_mtx;
+
+/*
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, maxoff;
+
+ if (low >= size)
+ return (low);
+
+ off = NDSLOT(low);
+ if (low % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
+ if ((mask &= ~map[off]) != 0UL)
+ return (off * NDENTRIES + ffsl(mask) - 1);
+ ++off;
+ }
+ for (maxoff = NDSLOTS(size); off < maxoff; ++off)
+ if (map[off] != ~0UL)
+ return (off * NDENTRIES + ffsl(~map[off]) - 1);
+ return (size);
+}
+
+/*
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, minoff;
+
+ off = NDSLOT(size);
+ if (size % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
+ if ((mask &= map[off]) != 0)
+ return (off * NDENTRIES + flsl(mask) - 1);
+ --off;
+ }
+ for (minoff = NDSLOT(0); off >= minoff; --off)
+ if (map[off] != 0)
+ return (off * NDENTRIES + flsl(map[off]) - 1);
+ return (-1);
+}
+
+static int
+fdisused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+ return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+}
+
+/*
+ * Mark a file descriptor as used.
+ */
+static void
+fdused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
+
+ fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+ if (fd > fdp->fd_lastfile)
+ fdp->fd_lastfile = fd;
+ if (fd == fdp->fd_freefile)
+ fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+}
+
+/*
+ * Mark a file descriptor as unused.
+ */
+static void
+fdunused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("fd=%d is still in use", fd));
+
+ fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+ if (fd < fdp->fd_freefile)
+ fdp->fd_freefile = fd;
+ if (fd == fdp->fd_lastfile)
+ fdp->fd_lastfile = fd_last_used(fdp, fd);
+}
+
+/*
+ * Free a file descriptor.
+ */
+static inline void
+fdfree(struct filedesc *fdp, int fd)
+{
+ struct filedescent *fde;
+
+ fde = &fdp->fd_ofiles[fd];
+ filecaps_free(&fde->fde_caps);
+ bzero(fde, sizeof(*fde));
+ fdunused(fdp, fd);
+}
+
+/*
+ * System calls on descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+{
+ struct proc *p = td->td_proc;
+ uint64_t lim;
+
+ PROC_LOCK(p);
+ td->td_retval[0] =
+ min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
+ PROC_UNLOCK(p);
+ if (lim < td->td_retval[0])
+ td->td_retval[0] = lim;
+ return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Note: keep in mind that a potential race condition exists when closing
+ * descriptors from a shared descriptor table (via rfork).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup2(struct thread *td, struct dup2_args *uap)
+{
+
+ return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
+ td->td_retval));
+}
+
+/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup(struct thread *td, struct dup_args *uap)
+{
+
+ return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
+}
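+
+/*
+ * A userland sketch of the dup()/dup2() semantics implemented above:
+ * redirecting stdout by duplicating onto descriptor 1.  The file name
+ * out.log is illustrative; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd;
+
+	fd = open("out.log", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd == -1)
+		return (1);
+	if (dup2(fd, STDOUT_FILENO) == -1)	/* stdout now names out.log */
+		return (1);
+	close(fd);				/* the duplicate keeps it open */
+	printf("written through the duplicated descriptor\n");
+	return (0);
+}
+#endif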
+
+/*
+ * The file control system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ long arg;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fcntl(struct thread *td, struct fcntl_args *uap)
+{
+ struct flock fl;
+ struct __oflock ofl;
+ intptr_t arg;
+ int error;
+ int cmd;
+
+ error = 0;
+ cmd = uap->cmd;
+ switch (uap->cmd) {
+ case F_OGETLK:
+ case F_OSETLK:
+ case F_OSETLKW:
+ /*
+ * Convert old flock structure to new.
+ */
+ error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
+ fl.l_start = ofl.l_start;
+ fl.l_len = ofl.l_len;
+ fl.l_pid = ofl.l_pid;
+ fl.l_type = ofl.l_type;
+ fl.l_whence = ofl.l_whence;
+ fl.l_sysid = 0;
+
+ switch (uap->cmd) {
+ case F_OGETLK:
+ cmd = F_GETLK;
+ break;
+ case F_OSETLK:
+ cmd = F_SETLK;
+ break;
+ case F_OSETLKW:
+ cmd = F_SETLKW;
+ break;
+ }
+ arg = (intptr_t)&fl;
+ break;
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ case F_SETLK_REMOTE:
+ error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
+ arg = (intptr_t)&fl;
+ break;
+ default:
+ arg = uap->arg;
+ break;
+ }
+ if (error)
+ return (error);
+ error = kern_fcntl(td, uap->fd, cmd, arg);
+ if (error)
+ return (error);
+ if (uap->cmd == F_OGETLK) {
+ ofl.l_start = fl.l_start;
+ ofl.l_len = fl.l_len;
+ ofl.l_pid = fl.l_pid;
+ ofl.l_type = fl.l_type;
+ ofl.l_whence = fl.l_whence;
+ error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
+ } else if (uap->cmd == F_GETLK) {
+ error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
+ }
+ return (error);
+}
+
+int
+kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
+{
+ struct filedesc *fdp;
+ struct flock *flp;
+ struct file *fp, *fp2;
+ struct filedescent *fde;
+ struct proc *p;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error, flg, tmp;
+ u_int old, new;
+ uint64_t bsize;
+ off_t foffset;
+
+ error = 0;
+ flg = F_POSIX;
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ switch (cmd) {
+ case F_DUPFD:
+ tmp = arg;
+ error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
+ break;
+
+ case F_DUPFD_CLOEXEC:
+ tmp = arg;
+ error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
+ td->td_retval);
+ break;
+
+ case F_DUP2FD:
+ tmp = arg;
+ error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
+ break;
+
+ case F_DUP2FD_CLOEXEC:
+ tmp = arg;
+ error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
+ td->td_retval);
+ break;
+
+ case F_GETFD:
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ fde = &fdp->fd_ofiles[fd];
+ td->td_retval[0] =
+ (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ FILEDESC_SUNLOCK(fdp);
+ break;
+
+ case F_SETFD:
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ fde = &fdp->fd_ofiles[fd];
+ fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
+ (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+ FILEDESC_XUNLOCK(fdp);
+ break;
+
+ case F_GETFL:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
+ if (error != 0)
+ break;
+ td->td_retval[0] = OFLAGS(fp->f_flag);
+ fdrop(fp, td);
+ break;
+
+ case F_SETFL:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
+ if (error != 0)
+ break;
+ do {
+ tmp = flg = fp->f_flag;
+ tmp &= ~FCNTLFLAGS;
+ tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
+ } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+ tmp = fp->f_flag & FNONBLOCK;
+ error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ tmp = fp->f_flag & FASYNC;
+ error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
+ if (error == 0) {
+ fdrop(fp, td);
+ break;
+ }
+ atomic_clear_int(&fp->f_flag, FNONBLOCK);
+ tmp = 0;
+ (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_GETOWN:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
+ if (error != 0)
+ break;
+ error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
+ if (error == 0)
+ td->td_retval[0] = tmp;
+ fdrop(fp, td);
+ break;
+
+ case F_SETOWN:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
+ if (error != 0)
+ break;
+ tmp = arg;
+ error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_SETLK_REMOTE:
+ error = priv_check(td, PRIV_NFS_LOCKD);
+ if (error)
+ return (error);
+ flg = F_REMOTE;
+ goto do_setlk;
+
+ case F_SETLKW:
+ flg |= F_WAIT;
+ /* FALLTHROUGH F_SETLK */
+
+ case F_SETLK:
+ do_setlk:
+ cap_rights_init(&rights, CAP_FLOCK);
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+
+ flp = (struct flock *)arg;
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if (foffset < 0 ||
+ (flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+
+ vp = fp->f_vnode;
+ switch (flp->l_type) {
+ case F_RDLCK:
+ if ((fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_WRLCK:
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_UNLCK:
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
+ flp, flg);
+ break;
+ case F_UNLCKSYS:
+ /*
+ * Temporary api for testing remote lock
+ * infrastructure.
+ */
+ if (flg != F_REMOTE) {
+ error = EINVAL;
+ break;
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCKSYS, flp, flg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0 || flp->l_type == F_UNLCK ||
+ flp->l_type == F_UNLCKSYS) {
+ fdrop(fp, td);
+ break;
+ }
+
+ /*
+ * Check for a race with close.
+ *
+ * The vnode is now advisory locked (or unlocked, but this case
+ * is not really important) as the caller requested.
+ * We had to drop the filedesc lock, so we need to recheck if
+ * the descriptor is still valid, because if it was closed
+ * in the meantime we need to remove advisory lock from the
+ * vnode - close on any descriptor leading to an advisory
+ * locked vnode, removes that lock.
+ * We will return 0 on purpose in that case, as the result of
+ * successful advisory lock might have been externally visible
+ * already. This is fine - effectively we pretend to the caller
+ * that the closing thread was a bit slower and that the
+ * advisory lock succeeded before the close.
+ */
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fp != fp2) {
+ flp->l_whence = SEEK_SET;
+ flp->l_start = 0;
+ flp->l_len = 0;
+ flp->l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCK, flp, F_POSIX);
+ }
+ fdrop(fp, td);
+ fdrop(fp2, td);
+ break;
+
+ case F_GETLK:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+ flp = (struct flock *)arg;
+ if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
+ flp->l_type != F_UNLCK) {
+ error = EINVAL;
+ fdrop(fp, td);
+ break;
+ }
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if ((flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start) ||
+ (flp->l_start < 0 &&
+ foffset < OFF_MIN - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+ vp = fp->f_vnode;
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
+ F_POSIX);
+ fdrop(fp, td);
+ break;
+
+ case F_RDAHEAD:
+		arg = arg ? 128 * 1024 : 0;
+ /* FALLTHROUGH */
+ case F_READAHEAD:
+ error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ error = EBADF;
+ break;
+ }
+ if (arg >= 0) {
+ vp = fp->f_vnode;
+ error = vn_lock(vp, LK_SHARED);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
+ VOP_UNLOCK(vp, 0);
+ fp->f_seqcount = (arg + bsize - 1) / bsize;
+ do {
+ new = old = fp->f_flag;
+ new |= FRDAHEAD;
+ } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+ } else {
+ do {
+ new = old = fp->f_flag;
+ new &= ~FRDAHEAD;
+ } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+ }
+ fdrop(fp, td);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
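+
+/*
+ * A userland sketch of the F_GETFD/F_SETFD and F_GETFL/F_SETFL commands
+ * handled above: FD_CLOEXEC ends up in UF_EXCLOSE, the status flags in
+ * f_flag.  Error handling is omitted; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd, fdflags, stflags;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	fdflags = fcntl(fd, F_GETFD);			/* per-descriptor flags */
+	fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC);	/* close across execve() */
+	stflags = fcntl(fd, F_GETFL);			/* per-open-file status */
+	fcntl(fd, F_SETFL, stflags | O_NONBLOCK);	/* merged into f_flag */
+	close(fd);
+	return (0);
+}
+#endif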
+
+/*
+ * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
+ */
+int
+do_dup(struct thread *td, int flags, int old, int new,
+ register_t *retval)
+{
+ struct filedesc *fdp;
+ struct filedescent *oldfde, *newfde;
+ struct proc *p;
+ struct file *fp;
+ struct file *delfp;
+ int error, maxfd;
+
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ /*
+ * Verify we have a valid descriptor to dup from and possibly to
+ * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
+ * return EINVAL when the new descriptor is out of bounds.
+ */
+ if (old < 0)
+ return (EBADF);
+ if (new < 0)
+ return (flags & DUP_FCNTL ? EINVAL : EBADF);
+ PROC_LOCK(p);
+ maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+ if (new >= maxfd)
+ return (flags & DUP_FCNTL ? EINVAL : EBADF);
+
+ FILEDESC_XLOCK(fdp);
+ if (fget_locked(fdp, old) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ oldfde = &fdp->fd_ofiles[old];
+ if (flags & DUP_FIXED && old == new) {
+ *retval = new;
+ if (flags & DUP_CLOEXEC)
+ fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+ }
+ fp = oldfde->fde_file;
+ fhold(fp);
+
+ /*
+ * If the caller specified a file descriptor, make sure the file
+ * table is large enough to hold it, and grab it. Otherwise, just
+ * allocate a new descriptor the usual way.
+ */
+ if (flags & DUP_FIXED) {
+ if (new >= fdp->fd_nfiles) {
+ /*
+ * The resource limits are here instead of e.g.
+ * fdalloc(), because the file descriptor table may be
+ * shared between processes, so we can't really use
+ * racct_add()/racct_sub(). Instead of counting the
+ * number of actually allocated descriptors, just put
+ * the limit on the size of the file descriptor table.
+ */
+#ifdef RACCT
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, new + 1);
+ PROC_UNLOCK(p);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (EMFILE);
+ }
+#endif
+ fdgrowtable(fdp, new + 1);
+ oldfde = &fdp->fd_ofiles[old];
+ }
+ newfde = &fdp->fd_ofiles[new];
+ if (newfde->fde_file == NULL)
+ fdused(fdp, new);
+ } else {
+ if ((error = fdalloc(td, new, &new)) != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (error);
+ }
+ newfde = &fdp->fd_ofiles[new];
+ }
+
+ KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
+ KASSERT(old != new, ("new fd is same as old"));
+
+ delfp = newfde->fde_file;
+
+ /*
+ * Duplicate the source descriptor.
+ */
+ *newfde = *oldfde;
+ filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
+ if ((flags & DUP_CLOEXEC) != 0)
+ newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
+ else
+ newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+ if (new > fdp->fd_lastfile)
+ fdp->fd_lastfile = new;
+ *retval = new;
+
+ if (delfp != NULL) {
+ (void) closefp(fdp, new, delfp, td, 1);
+ /* closefp() drops the FILEDESC lock for us. */
+ } else {
+ FILEDESC_XUNLOCK(fdp);
+ }
+
+ return (0);
+}
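+
+/*
+ * The DUP_FCNTL/DUP_FIXED distinction drawn above is visible from
+ * userland: F_DUPFD takes a minimum descriptor number and rejects an
+ * out-of-range value with EINVAL, while dup2() takes an exact target
+ * and reports EBADF.  A sketch; the block is never compiled here.
+ */
+#if 0
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd, a, b;
+
+	fd = open("/dev/null", O_RDWR);
+	if (fd == -1)
+		return (1);
+	a = fcntl(fd, F_DUPFD, 10);	/* lowest free descriptor >= 10 */
+	b = dup2(fd, 20);		/* exactly 20, closed first if open */
+	printf("F_DUPFD gave %d, dup2 gave %d\n", a, b);
+	if (fcntl(fd, F_DUPFD, -1) == -1 && errno == EINVAL)
+		printf("negative minimum rejected with EINVAL\n");
+	if (dup2(fd, -1) == -1 && errno == EBADF)
+		printf("negative target rejected with EBADF\n");
+	return (0);
+}
+#endif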
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ */
+void
+funsetown(struct sigio **sigiop)
+{
+ struct sigio *sigio;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ *(sigio->sio_myref) = NULL;
+ if ((sigio)->sio_pgid < 0) {
+ struct pgrp *pg = (sigio)->sio_pgrp;
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else {
+ struct proc *p = (sigio)->sio_proc;
+ PROC_LOCK(p);
+ SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+}
+
+/*
+ * Free a list of sigio structures.
+ * We only need to lock the SIGIO_LOCK because we have made ourselves
+ * inaccessible to callers of fsetown and therefore do not need to lock
+ * the proc or pgrp struct for the list manipulation.
+ */
+void
+funsetownlst(struct sigiolst *sigiolst)
+{
+ struct proc *p;
+ struct pgrp *pg;
+ struct sigio *sigio;
+
+ sigio = SLIST_FIRST(sigiolst);
+ if (sigio == NULL)
+ return;
+ p = NULL;
+ pg = NULL;
+
+ /*
+ * Every entry of the list should belong
+ * to a single proc or pgrp.
+ */
+ if (sigio->sio_pgid < 0) {
+ pg = sigio->sio_pgrp;
+ PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
+ } else /* if (sigio->sio_pgid > 0) */ {
+ p = sigio->sio_proc;
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ }
+
+ SIGIO_LOCK();
+ while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
+ *(sigio->sio_myref) = NULL;
+ if (pg != NULL) {
+ KASSERT(sigio->sio_pgid < 0,
+ ("Proc sigio in pgrp sigio list"));
+ KASSERT(sigio->sio_pgrp == pg,
+ ("Bogus pgrp in sigio list"));
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else /* if (p != NULL) */ {
+ KASSERT(sigio->sio_pgid > 0,
+ ("Pgrp sigio in proc sigio list"));
+ KASSERT(sigio->sio_proc == p,
+ ("Bogus proc in sigio list"));
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ SIGIO_LOCK();
+ }
+ SIGIO_UNLOCK();
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ */
+int
+fsetown(pid_t pgid, struct sigio **sigiop)
+{
+ struct proc *proc;
+ struct pgrp *pgrp;
+ struct sigio *sigio;
+ int ret;
+
+ if (pgid == 0) {
+ funsetown(sigiop);
+ return (0);
+ }
+
+ ret = 0;
+
+ /* Allocate and fill in the new sigio out of locks. */
+ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
+ sigio->sio_pgid = pgid;
+ sigio->sio_ucred = crhold(curthread->td_ucred);
+ sigio->sio_myref = sigiop;
+
+ sx_slock(&proctree_lock);
+ if (pgid > 0) {
+ proc = pfind(pgid);
+ if (proc == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ PROC_UNLOCK(proc);
+ if (proc->p_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ pgrp = NULL;
+ } else /* if (pgid < 0) */ {
+ pgrp = pgfind(-pgid);
+ if (pgrp == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+ PGRP_UNLOCK(pgrp);
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ if (pgrp->pg_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ proc = NULL;
+ }
+ funsetown(sigiop);
+ if (pgid > 0) {
+ PROC_LOCK(proc);
+ /*
+ * Since funsetownlst() is called without the proctree
+ * locked, we need to check for P_WEXIT.
+ * XXX: is ESRCH correct?
+ */
+ if ((proc->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(proc);
+ ret = ESRCH;
+ goto fail;
+ }
+ SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_proc = proc;
+ PROC_UNLOCK(proc);
+ } else {
+ PGRP_LOCK(pgrp);
+ SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_pgrp = pgrp;
+ PGRP_UNLOCK(pgrp);
+ }
+ sx_sunlock(&proctree_lock);
+ SIGIO_LOCK();
+ *sigiop = sigio;
+ SIGIO_UNLOCK();
+ return (0);
+
+fail:
+ sx_sunlock(&proctree_lock);
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ return (ret);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(struct sigio **sigiop)
+{
+ pid_t pgid;
+
+ SIGIO_LOCK();
+ pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
+ SIGIO_UNLOCK();
+ return (pgid);
+}
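+
+/*
+ * fsetown()/fgetown() back FIOSETOWN/FIOGETOWN and F_SETOWN/F_GETOWN.
+ * A userland sketch of signal-driven I/O on a descriptor whose backend
+ * supports FIOASYNC (a tty or socket); on_sigio is a hypothetical
+ * handler name.  The block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+
+static volatile sig_atomic_t got_sigio;
+
+static void
+on_sigio(int sig)
+{
+
+	(void)sig;
+	got_sigio = 1;				/* just note that data is ready */
+}
+
+int
+main(void)
+{
+	int fd = STDIN_FILENO;
+
+	signal(SIGIO, on_sigio);
+	fcntl(fd, F_SETOWN, getpid());		/* deliver SIGIO to this pid */
+	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
+	while (!got_sigio)
+		pause();			/* wait for input to arrive */
+	return (0);
+}
+#endif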
+
+/*
+ * Function drops the filedesc lock on return.
+ */
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+ int holdleaders)
+{
+ int error;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (holdleaders) {
+ if (td->td_proc->p_fdtol != NULL) {
+ /*
+ * Ask fdfree() to sleep to ensure that all relevant
+ * process leaders can be traversed in closef().
+ */
+ fdp->fd_holdleaderscount++;
+ } else {
+ holdleaders = 0;
+ }
+ }
+
+ /*
+ * We now hold the fp reference that used to be owned by the
+ * descriptor array. We have to unlock the FILEDESC *AFTER*
+ * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleting a knote for the new fd.
+ */
+ knote_fdclose(td, fd);
+
+ /*
+ * We need to notify mqueue if the object is of type mqueue.
+ */
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp);
+ FILEDESC_XUNLOCK(fdp);
+
+ error = closef(fp, td);
+ if (holdleaders) {
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_holdleaderscount--;
+ if (fdp->fd_holdleaderscount == 0 &&
+ fdp->fd_holdleaderswakeup != 0) {
+ fdp->fd_holdleaderswakeup = 0;
+ wakeup(&fdp->fd_holdleaderscount);
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ return (error);
+}
+
+/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_close(struct thread *td, struct close_args *uap)
+{
+
+ return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(struct thread *td, int fd)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+
+ fdp = td->td_proc->p_fd;
+
+ AUDIT_SYSCLOSE(td, fd);
+
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ fdfree(fdp, fd);
+
+ /* closefp() drops the FILEDESC lock for us. */
+ return (closefp(fdp, fd, fp, td, 1));
+}
+
+/*
+ * Close open file descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct closefrom_args {
+ int lowfd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_closefrom(struct thread *td, struct closefrom_args *uap)
+{
+ struct filedesc *fdp;
+ int fd;
+
+ fdp = td->td_proc->p_fd;
+ AUDIT_ARG_FD(uap->lowfd);
+
+ /*
+	 * Treat negative starting file descriptor values the same as
+	 * closefrom(0), which closes all files.
+ */
+ if (uap->lowfd < 0)
+ uap->lowfd = 0;
+ FILEDESC_SLOCK(fdp);
+ for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
+ if (fdp->fd_ofiles[fd].fde_file != NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ (void)kern_close(td, fd);
+ FILEDESC_SLOCK(fdp);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ return (0);
+}
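+
+/*
+ * closefrom() is the usual way for daemons, or for code about to exec
+ * an untrusted child, to shed inherited descriptors.  A minimal sketch;
+ * the block is never compiled here.
+ */
+#if 0
+#include <unistd.h>
+
+int
+main(void)
+{
+
+	/* Keep only stdin, stdout and stderr; drop anything leaked to us. */
+	closefrom(3);
+	return (0);
+}
+#endif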
+
+#if defined(COMPAT_43)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+ofstat(struct thread *td, struct ofstat_args *uap)
+{
+ struct ostat oub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ cvtstat(&ub, &oub);
+ error = copyout(&oub, uap->sb, sizeof(oub));
+ }
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fstat(struct thread *td, struct fstat_args *uap)
+{
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0)
+ error = copyout(&ub, uap->sb, sizeof(ub));
+ return (error);
+}
+
+int
+kern_fstat(struct thread *td, int fd, struct stat *sbp)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
+ if (error != 0)
+ return (error);
+
+ AUDIT_ARG_FILE(td->td_proc, fp);
+
+ error = fo_stat(fp, sbp, td->td_ucred, td);
+ fdrop(fp, td);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstat(sbp);
+#endif
+ return (error);
+}
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nfstat_args {
+ int fd;
+ struct nstat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+sys_nfstat(struct thread *td, struct nfstat_args *uap)
+{
+ struct nstat nub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ cvtnstat(&ub, &nub);
+ error = copyout(&nub, uap->sb, sizeof(nub));
+ }
+ return (error);
+}
+
+/*
+ * Return pathconf information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
+{
+ struct file *fp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
+ if (error != 0)
+ return (error);
+
+ /* If asynchronous I/O is available, it works for all descriptors. */
+ if (uap->name == _PC_ASYNC_IO) {
+ td->td_retval[0] = async_io_version;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp != NULL) {
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_PATHCONF(vp, uap->name, td->td_retval);
+ VOP_UNLOCK(vp, 0);
+ } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
+ if (uap->name != _PC_PIPE_BUF) {
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = PIPE_BUF;
+ error = 0;
+ }
+ } else {
+ error = EOPNOTSUPP;
+ }
+out:
+ fdrop(fp, td);
+ return (error);
+}
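+
+/*
+ * A userland sketch of fpathconf(): _PC_PIPE_BUF on a pipe reports
+ * PIPE_BUF, the largest write guaranteed to be atomic.  The block is
+ * never compiled here.
+ */
+#if 0
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int pfd[2];
+	long pbuf;
+
+	if (pipe(pfd) == -1)
+		return (1);
+	pbuf = fpathconf(pfd[0], _PC_PIPE_BUF);	/* atomic write limit */
+	printf("atomic pipe write limit: %ld bytes\n", pbuf);
+	close(pfd[0]);
+	close(pfd[1]);
+	return (0);
+}
+#endif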
+
+/*
+ * Initialize filecaps structure.
+ */
+void
+filecaps_init(struct filecaps *fcaps)
+{
+
+ bzero(fcaps, sizeof(*fcaps));
+ fcaps->fc_nioctls = -1;
+}
+
+/*
+ * Copy filecaps structure allocating memory for ioctls array if needed.
+ */
+void
+filecaps_copy(const struct filecaps *src, struct filecaps *dst)
+{
+ size_t size;
+
+ *dst = *src;
+ if (src->fc_ioctls != NULL) {
+ KASSERT(src->fc_nioctls > 0,
+ ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+ bcopy(src->fc_ioctls, dst->fc_ioctls, size);
+ }
+}
+
+/*
+ * Move filecaps structure to the new place and clear the old place.
+ */
+void
+filecaps_move(struct filecaps *src, struct filecaps *dst)
+{
+
+ *dst = *src;
+ bzero(src, sizeof(*src));
+}
+
+/*
+ * Fill the given filecaps structure with full rights.
+ */
+static void
+filecaps_fill(struct filecaps *fcaps)
+{
+
+ CAP_ALL(&fcaps->fc_rights);
+ fcaps->fc_ioctls = NULL;
+ fcaps->fc_nioctls = -1;
+ fcaps->fc_fcntls = CAP_FCNTL_ALL;
+}
+
+/*
+ * Free memory allocated within filecaps structure.
+ */
+void
+filecaps_free(struct filecaps *fcaps)
+{
+
+ free(fcaps->fc_ioctls, M_FILECAPS);
+ bzero(fcaps, sizeof(*fcaps));
+}
+
+/*
+ * Validate the given filecaps structure.
+ */
+static void
+filecaps_validate(const struct filecaps *fcaps, const char *func)
+{
+
+ KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
+ ("%s: invalid rights", func));
+ KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
+ ("%s: invalid fcntls", func));
+ KASSERT(fcaps->fc_fcntls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
+ ("%s: fcntls without CAP_FCNTL", func));
+ KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
+ (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
+ ("%s: invalid ioctls", func));
+ KASSERT(fcaps->fc_nioctls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
+ ("%s: ioctls without CAP_IOCTL", func));
+}
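+
+/*
+ * The filecaps kept next to each descriptor are what userland adjusts
+ * through the Capsicum API.  A sketch of restricting a descriptor,
+ * assuming the cap_rights_limit(2) interface of the same vintage as
+ * this file; the block is never compiled here.
+ */
+#if 0
+#include <sys/capability.h>
+
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	cap_rights_t rights;
+	int fd;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	/* From now on, only read() and fstat() work on this descriptor. */
+	cap_rights_init(&rights, CAP_READ, CAP_FSTAT);
+	if (cap_rights_limit(fd, &rights) == -1)
+		return (1);
+	close(fd);
+	return (0);
+}
+#endif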
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.
+ */
+static void
+fdgrowtable(struct filedesc *fdp, int nfd)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft;
+ struct filedescent *ntable;
+ struct filedescent *otable;
+ int nnfiles, onfiles;
+ NDSLOTTYPE *nmap, *omap;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
+
+ /* save old values */
+ onfiles = fdp->fd_nfiles;
+ otable = fdp->fd_ofiles;
+ omap = fdp->fd_map;
+
+ /* compute the size of the new table */
+ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
+ if (nnfiles <= onfiles)
+ /* the table is already large enough */
+ return;
+
+ /*
+ * Allocate a new table and map. We need enough space for the
+ * file entries themselves and the struct freetable we will use
+ * when we decommission the table and place it on the freelist.
+ * We place the struct freetable in the middle so we don't have
+ * to worry about padding.
+ */
+ ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
+ M_FILEDESC, M_ZERO | M_WAITOK);
+ nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+
+ /* copy the old data over and point at the new tables */
+ memcpy(ntable, otable, onfiles * sizeof(*otable));
+ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
+
+ /* update the pointers and counters */
+ fdp->fd_nfiles = nnfiles;
+ fdp->fd_ofiles = ntable;
+ fdp->fd_map = nmap;
+
+ /*
+ * Do not free the old file table, as some threads may still
+ * reference entries within it. Instead, place it on a freelist
+ * which will be processed when the struct filedesc is released.
+ *
+ * Do, however, free the old map.
+ *
+ * Note that if onfiles == NDFILE, we're dealing with the original
+ * static allocation contained within (struct filedesc0 *)fdp,
+ * which must not be freed.
+ */
+ if (onfiles > NDFILE) {
+ ft = (struct freetable *)&otable[onfiles];
+ fdp0 = (struct filedesc0 *)fdp;
+ ft->ft_table = otable;
+ SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
+ free(omap, M_FILEDESC);
+ }
+}
+
+/*
+ * Allocate a file descriptor for the process.
+ */
+int
+fdalloc(struct thread *td, int minfd, int *result)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int fd = -1, maxfd, allocfd;
+#ifdef RACCT
+ int error;
+#endif
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (fdp->fd_freefile > minfd)
+ minfd = fdp->fd_freefile;
+
+ PROC_LOCK(p);
+ maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+
+ /*
+ * Search the bitmap for a free descriptor starting at minfd.
+ * If none is found, grow the file table.
+ */
+ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+ if (fd >= maxfd)
+ return (EMFILE);
+ if (fd >= fdp->fd_nfiles) {
+ allocfd = min(fd * 2, maxfd);
+#ifdef RACCT
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, allocfd);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (EMFILE);
+#endif
+ /*
+ * fd is already equal to first free descriptor >= minfd, so
+ * we only need to grow the table and we are done.
+ */
+ fdgrowtable(fdp, allocfd);
+ }
+
+ /*
+ * Perform some sanity checks, then mark the file descriptor as
+ * used and return it to the caller.
+ */
+ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+ ("invalid descriptor %d", fd));
+ KASSERT(!fdisused(fdp, fd),
+ ("fd_first_free() returned non-free descriptor"));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("file descriptor isn't free"));
+ KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
+ fdused(fdp, fd);
+ *result = fd;
+ return (0);
+}
+
+/*
+ * Allocate n file descriptors for the process.
+ */
+int
+fdallocn(struct thread *td, int minfd, int *fds, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int i;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (!fdavail(td, n))
+ return (EMFILE);
+
+ for (i = 0; i < n; i++)
+ if (fdalloc(td, 0, &fds[i]) != 0)
+ break;
+
+ if (i < n) {
+ for (i--; i >= 0; i--)
+ fdunused(fdp, fds[i]);
+ return (EMFILE);
+ }
+
+ return (0);
+}
+
+/*
+ * Check to see whether n user file descriptors are available to the process
+ * p.
+ */
+int
+fdavail(struct thread *td, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = td->td_proc->p_fd;
+ int i, lim, last;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ /*
+ * XXX: This is only called from uipc_usrreq.c:unp_externalize();
+ * call racct_add() from there instead of dealing with containers
+ * here.
+ */
+ PROC_LOCK(p);
+ lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+ if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
+ return (1);
+ last = min(fdp->fd_nfiles, lim);
+ for (i = fdp->fd_freefile; i < last; i++) {
+ if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it.  We add one reference to the file for the
+ * descriptor table and one reference for resultfp.  This is to prevent us
+ * from being preempted and having the entry in the descriptor table closed
+ * after we release the FILEDESC lock.
+ */
+int
+falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
+{
+ struct file *fp;
+ int error, fd;
+
+ error = falloc_noinstall(td, &fp);
+ if (error)
+ return (error); /* no reference held on error */
+
+ error = finstall(td, fp, &fd, flags, NULL);
+ if (error) {
+ fdrop(fp, td); /* one reference (fp only) */
+ return (error);
+ }
+
+ if (resultfp != NULL)
+ *resultfp = fp; /* copy out result */
+ else
+ fdrop(fp, td); /* release local reference */
+
+ if (resultfd != NULL)
+ *resultfd = fd;
+
+ return (0);
+}
+
+/*
+ * Create a new open file structure without allocating a file descriptor.
+ */
+int
+falloc_noinstall(struct thread *td, struct file **resultfp)
+{
+ struct file *fp;
+ int maxuserfiles = maxfiles - (maxfiles / 20);
+ static struct timeval lastfail;
+ static int curfail;
+
+ KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
+
+ if ((openfiles >= maxuserfiles &&
+ priv_check(td, PRIV_MAXFILES) != 0) ||
+ openfiles >= maxfiles) {
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("kern.maxfiles limit exceeded by uid %i, "
+ "please see tuning(7).\n", td->td_ucred->cr_ruid);
+ }
+ return (ENFILE);
+ }
+ atomic_add_int(&openfiles, 1);
+ fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
+ refcount_init(&fp->f_count, 1);
+ fp->f_cred = crhold(td->td_ucred);
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ fp->f_vnode = NULL;
+ *resultfp = fp;
+ return (0);
+}
+
+/*
+ * Install a file in a file descriptor table.
+ */
+int
+finstall(struct thread *td, struct file *fp, int *fd, int flags,
+ struct filecaps *fcaps)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct filedescent *fde;
+ int error;
+
+ KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
+ KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
+ if (fcaps != NULL)
+ filecaps_validate(fcaps, __func__);
+
+ FILEDESC_XLOCK(fdp);
+ if ((error = fdalloc(td, 0, fd))) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+ fhold(fp);
+ fde = &fdp->fd_ofiles[*fd];
+ fde->fde_file = fp;
+ if ((flags & O_CLOEXEC) != 0)
+ fde->fde_flags |= UF_EXCLOSE;
+ if (fcaps != NULL)
+ filecaps_move(fcaps, &fde->fde_caps);
+ else
+ filecaps_fill(&fde->fde_caps);
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+}
+
+/*
+ * Build a new filedesc structure from another.
+ * Copy the current, root, and jail root vnode references.
+ */
+struct filedesc *
+fdinit(struct filedesc *fdp)
+{
+ struct filedesc0 *newfdp;
+
+ newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
+ FILEDESC_LOCK_INIT(&newfdp->fd_fd);
+ if (fdp != NULL) {
+ FILEDESC_XLOCK(fdp);
+ newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+ if (newfdp->fd_fd.fd_cdir)
+ VREF(newfdp->fd_fd.fd_cdir);
+ newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+ if (newfdp->fd_fd.fd_rdir)
+ VREF(newfdp->fd_fd.fd_rdir);
+ newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
+ if (newfdp->fd_fd.fd_jdir)
+ VREF(newfdp->fd_fd.fd_jdir);
+ FILEDESC_XUNLOCK(fdp);
+ }
+
+ /* Create the file descriptor table. */
+ newfdp->fd_fd.fd_refcnt = 1;
+ newfdp->fd_fd.fd_holdcnt = 1;
+ newfdp->fd_fd.fd_cmask = CMASK;
+ newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+ newfdp->fd_fd.fd_nfiles = NDFILE;
+ newfdp->fd_fd.fd_map = newfdp->fd_dmap;
+ newfdp->fd_fd.fd_lastfile = -1;
+ return (&newfdp->fd_fd);
+}
+
+static struct filedesc *
+fdhold(struct proc *p)
+{
+ struct filedesc *fdp;
+
+ mtx_lock(&fdesc_mtx);
+ fdp = p->p_fd;
+ if (fdp != NULL)
+ fdp->fd_holdcnt++;
+ mtx_unlock(&fdesc_mtx);
+ return (fdp);
+}
+
+static void
+fddrop(struct filedesc *fdp)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft;
+ int i;
+
+ mtx_lock(&fdesc_mtx);
+ i = --fdp->fd_holdcnt;
+ mtx_unlock(&fdesc_mtx);
+ if (i > 0)
+ return;
+
+ FILEDESC_LOCK_DESTROY(fdp);
+ fdp0 = (struct filedesc0 *)fdp;
+ while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
+ SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
+ free(ft->ft_table, M_FILEDESC);
+ }
+ free(fdp, M_FILEDESC);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(struct filedesc *fdp)
+{
+
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_refcnt++;
+ FILEDESC_XUNLOCK(fdp);
+ return (fdp);
+}
+
+/*
+ * Unshare a filedesc structure, if necessary by making a copy
+ */
+void
+fdunshare(struct proc *p, struct thread *td)
+{
+
+ FILEDESC_XLOCK(p->p_fd);
+ if (p->p_fd->fd_refcnt > 1) {
+ struct filedesc *tmp;
+
+ FILEDESC_XUNLOCK(p->p_fd);
+ tmp = fdcopy(p->p_fd);
+ fdescfree(td);
+ p->p_fd = tmp;
+ } else
+ FILEDESC_XUNLOCK(p->p_fd);
+}
+
+/*
+ * Copy a filedesc structure.  A NULL pointer in returns a NULL reference;
+ * this is to ease callers, not to catch errors.
+ */
+struct filedesc *
+fdcopy(struct filedesc *fdp)
+{
+ struct filedesc *newfdp;
+ struct filedescent *nfde, *ofde;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return (NULL);
+
+ newfdp = fdinit(fdp);
+ FILEDESC_SLOCK(fdp);
+ while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
+ fdgrowtable(newfdp, fdp->fd_lastfile + 1);
+ FILEDESC_XUNLOCK(newfdp);
+ FILEDESC_SLOCK(fdp);
+ }
+ /* copy all passable descriptors (i.e. not kqueue) */
+ newfdp->fd_freefile = -1;
+ for (i = 0; i <= fdp->fd_lastfile; ++i) {
+ ofde = &fdp->fd_ofiles[i];
+ if (fdisused(fdp, i) &&
+ (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
+ ofde->fde_file->f_ops != &badfileops) {
+ nfde = &newfdp->fd_ofiles[i];
+ *nfde = *ofde;
+ filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
+ fhold(nfde->fde_file);
+ newfdp->fd_lastfile = i;
+ } else {
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ }
+ }
+ newfdp->fd_cmask = fdp->fd_cmask;
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
+ for (i = 0; i <= newfdp->fd_lastfile; ++i) {
+ if (newfdp->fd_ofiles[i].fde_file != NULL)
+ fdused(newfdp, i);
+ }
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ FILEDESC_XUNLOCK(newfdp);
+ return (newfdp);
+}
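+
+/*
+ * fdcopy() duplicates the table, not the open files, so the copied
+ * descriptors still share f_offset with the originals; plain fork()
+ * takes this path.  A userland sketch of the shared offset; the block
+ * is never compiled here.
+ */
+#if 0
+#include <sys/wait.h>
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	if (fork() == 0) {
+		lseek(fd, 100, SEEK_SET);	/* child moves the offset */
+		_exit(0);
+	}
+	wait(NULL);
+	/* The parent sees 100: both descriptors name one struct file. */
+	printf("offset after child's lseek: %jd\n",
+	    (intmax_t)lseek(fd, 0, SEEK_CUR));
+	return (0);
+}
+#endif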
+
+/*
+ * Release a filedesc structure.
+ */
+void
+fdescfree(struct thread *td)
+{
+ struct filedesc *fdp;
+ int i;
+ struct filedesc_to_leader *fdtol;
+ struct file *fp;
+ struct vnode *cdir, *jdir, *rdir, *vp;
+ struct flock lf;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ racct_set(td->td_proc, RACCT_NOFILE, 0);
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ /* Check for special need to clear POSIX style locks */
+ fdtol = td->td_proc->p_fdtol;
+ if (fdtol != NULL) {
+ FILEDESC_XLOCK(fdp);
+ KASSERT(fdtol->fdl_refcount > 0,
+ ("filedesc_to_refcount botch: fdl_refcount=%d",
+ fdtol->fdl_refcount));
+ if (fdtol->fdl_refcount == 1 &&
+ (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
+ continue;
+ fhold(fp);
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)td->td_proc->p_leader, F_UNLCK,
+ &lf, F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdrop(fp, td);
+ }
+ }
+ retry:
+ if (fdtol->fdl_refcount == 1) {
+ if (fdp->fd_holdleaderscount > 0 &&
+ (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ /*
+ * close() or do_dup() has cleared a reference
+ * in a shared file descriptor table.
+ */
+ fdp->fd_holdleaderswakeup = 1;
+ sx_sleep(&fdp->fd_holdleaderscount,
+ FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
+ goto retry;
+ }
+ if (fdtol->fdl_holdcount > 0) {
+ /*
+ * Ensure that fdtol->fdl_leader remains
+ * valid in closef().
+ */
+ fdtol->fdl_wakeup = 1;
+ sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
+ "fdlhold", 0);
+ goto retry;
+ }
+ }
+ fdtol->fdl_refcount--;
+ if (fdtol->fdl_refcount == 0 &&
+ fdtol->fdl_holdcount == 0) {
+ fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
+ fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
+ } else
+ fdtol = NULL;
+ td->td_proc->p_fdtol = NULL;
+ FILEDESC_XUNLOCK(fdp);
+ if (fdtol != NULL)
+ free(fdtol, M_FILEDESC_TO_LEADER);
+ }
+ FILEDESC_XLOCK(fdp);
+ i = --fdp->fd_refcnt;
+ FILEDESC_XUNLOCK(fdp);
+ if (i > 0)
+ return;
+
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL) {
+ FILEDESC_XLOCK(fdp);
+ fdfree(fdp, i);
+ FILEDESC_XUNLOCK(fdp);
+ (void) closef(fp, td);
+ }
+ }
+ FILEDESC_XLOCK(fdp);
+
+ /* XXX This should happen earlier. */
+ mtx_lock(&fdesc_mtx);
+ td->td_proc->p_fd = NULL;
+ mtx_unlock(&fdesc_mtx);
+
+ if (fdp->fd_nfiles > NDFILE)
+ free(fdp->fd_ofiles, M_FILEDESC);
+ if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
+ free(fdp->fd_map, M_FILEDESC);
+
+ fdp->fd_nfiles = 0;
+
+ cdir = fdp->fd_cdir;
+ fdp->fd_cdir = NULL;
+ rdir = fdp->fd_rdir;
+ fdp->fd_rdir = NULL;
+ jdir = fdp->fd_jdir;
+ fdp->fd_jdir = NULL;
+ FILEDESC_XUNLOCK(fdp);
+
+ if (cdir != NULL)
+ vrele(cdir);
+ if (rdir != NULL)
+ vrele(rdir);
+ if (jdir != NULL)
+ vrele(jdir);
+
+ fddrop(fdp);
+}
+
+/*
+ * For setugid programs, we don't want people to use that setugidness
+ * to generate error messages which write to a file that would otherwise
+ * be off-limits to the process.  We check for filesystems where
+ * the vnode can change out from under us after execve (like [lin]procfs).
+ *
+ * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
+ * sufficient. We also don't check for setugidness since we know we are.
+ */
+static int
+is_unsafe(struct file *fp)
+{
+ if (fp->f_type == DTYPE_VNODE) {
+ struct vnode *vp = fp->f_vnode;
+
+ if ((vp->v_vflag & VV_PROCDEP) != 0)
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Make this setugid thing safe, if at all possible.
+ */
+void
+setugidsafety(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+ /*
+ * Note: fdp->fd_ofiles may be reallocated out from under us while
+ * we are blocked in a close. Be careful!
+ */
+ FILEDESC_XLOCK(fdp);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ if (i > 2)
+ break;
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL && is_unsafe(fp)) {
+ knote_fdclose(td, i);
+ /*
+ * NULL-out descriptor prior to close to avoid
+ * a race while close blocks.
+ */
+ fdfree(fdp, i);
+ FILEDESC_XUNLOCK(fdp);
+ (void) closef(fp, td);
+ FILEDESC_XLOCK(fdp);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * If a specific file object occupies a specific file descriptor, close the
+ * file descriptor entry and drop a reference on the file object. This is a
+ * convenience function for handling a subsequent error in a function that
+ * calls falloc(); it covers the race in which another thread might have
+ * closed the file descriptor out from under the thread creating the file
+ * object.
+ */
+void
+fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
+{
+
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_ofiles[idx].fde_file == fp) {
+ fdfree(fdp, idx);
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * Close any files on exec?
+ */
+void
+fdcloseexec(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct filedescent *fde;
+ struct file *fp;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+ /*
+ * We cannot cache fd_ofiles since operations
+ * may block and rip them out from under us.
+ */
+ FILEDESC_XLOCK(fdp);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fde = &fdp->fd_ofiles[i];
+ fp = fde->fde_file;
+ if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
+ (fde->fde_flags & UF_EXCLOSE))) {
+ fdfree(fdp, i);
+ (void) closefp(fdp, i, fp, td, 0);
+ /* closefp() drops the FILEDESC lock. */
+ FILEDESC_XLOCK(fdp);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * It is unsafe for set[ug]id processes to be started with file
+ * descriptors 0..2 closed, as these descriptors are given implicit
+ * significance in the Standard C library. fdcheckstd() will create a
+ * descriptor referencing /dev/null for each of stdin, stdout, and
+ * stderr that is not already open.
+ */
+int
+fdcheckstd(struct thread *td)
+{
+ struct filedesc *fdp;
+ register_t retval, save;
+ int i, error, devnull;
+
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return (0);
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
+ devnull = -1;
+ error = 0;
+ for (i = 0; i < 3; i++) {
+ if (fdp->fd_ofiles[i].fde_file != NULL)
+ continue;
+ if (devnull < 0) {
+ save = td->td_retval[0];
+ error = kern_open(td, "/dev/null", UIO_SYSSPACE,
+ O_RDWR, 0);
+ devnull = td->td_retval[0];
+ td->td_retval[0] = save;
+ if (error)
+ break;
+ KASSERT(devnull == i, ("oof, we didn't get our fd"));
+ } else {
+ error = do_dup(td, DUP_FIXED, devnull, i, &retval);
+ if (error != 0)
+ break;
+ }
+ }
+ return (error);
+}
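+
+/*
+ * Ordinary userland daemons take the same precaution before opening
+ * files of their own.  A sketch using a hypothetical helper,
+ * sanitize_stdio(); the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+static void
+sanitize_stdio(void)
+{
+	int fd;
+
+	/* Each open() that lands at or below 2 has filled a hole; keep it. */
+	while ((fd = open("/dev/null", O_RDWR)) != -1 && fd <= STDERR_FILENO)
+		continue;
+	if (fd > STDERR_FILENO)
+		close(fd);		/* 0..2 were already open; undo */
+}
+
+int
+main(void)
+{
+
+	sanitize_stdio();
+	return (0);
+}
+#endif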
+
+/*
+ * Internal form of close. Decrement reference count on file structure.
+ * Note: td may be NULL when closing a file that was being passed in a
+ * message.
+ *
+ * XXXRW: Giant is not required for the caller, but often will be held; this
+ * makes it moderately likely the Giant will be recursed in the VFS case.
+ */
+int
+closef(struct file *fp, struct thread *td)
+{
+ struct vnode *vp;
+ struct flock lf;
+ struct filedesc_to_leader *fdtol;
+ struct filedesc *fdp;
+
+ /*
+ * POSIX record locking dictates that any close releases ALL
+ * locks owned by this process. This is handled by setting
+ * a flag in the unlock to free ONLY locks obeying POSIX
+ * semantics, and not to free BSD-style file locks.
+ * If the descriptor was in a message, POSIX-style locks
+ * aren't passed with the descriptor, and the thread pointer
+ * will be NULL. Callers should be careful only to pass a
+ * NULL thread pointer when there really is no owning
+ * context that might have locks, or the locks will be
+ * leaked.
+ */
+ if (fp->f_type == DTYPE_VNODE && td != NULL) {
+ vp = fp->f_vnode;
+ if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
+ F_UNLCK, &lf, F_POSIX);
+ }
+ fdtol = td->td_proc->p_fdtol;
+ if (fdtol != NULL) {
+ /*
+ * Handle special case where file descriptor table is
+ * shared between multiple process leaders.
+ */
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ for (fdtol = fdtol->fdl_next;
+ fdtol != td->td_proc->p_fdtol;
+ fdtol = fdtol->fdl_next) {
+ if ((fdtol->fdl_leader->p_flag &
+ P_ADVLOCK) == 0)
+ continue;
+ fdtol->fdl_holdcount++;
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+ F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_holdcount--;
+ if (fdtol->fdl_holdcount == 0 &&
+ fdtol->fdl_wakeup != 0) {
+ fdtol->fdl_wakeup = 0;
+ wakeup(fdtol);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ }
+ return (fdrop(fp, td));
+}
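+
+/*
+ * The "any close releases ALL locks" rule described above is a classic
+ * POSIX pitfall: a lock taken through one descriptor is dropped as soon
+ * as any other descriptor for the same file is closed.  A userland
+ * sketch; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	struct flock fl = {
+		.l_type = F_WRLCK,		/* whole-file write lock */
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0,
+	};
+	int fd1, fd2;
+
+	fd1 = open("lockfile", O_RDWR | O_CREAT, 0644);
+	fd2 = open("lockfile", O_RDWR);		/* second descriptor, same file */
+	if (fd1 == -1 || fd2 == -1)
+		return (1);
+	fcntl(fd1, F_SETLK, &fl);		/* lock owned by the process */
+	close(fd2);				/* ...and silently dropped here */
+	return (0);
+}
+#endif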
+
+/*
+ * Initialize the file pointer with the specified properties.
+ *
+ * The ops are set with release semantics to be certain that the flags, type,
+ * and data are visible when ops is. This is to prevent ops methods from being
+ * called with bad data.
+ */
+void
+finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
+{
+ fp->f_data = data;
+ fp->f_flag = flag;
+ fp->f_type = type;
+ atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
+}
+
+int
+fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+ int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
+{
+ struct file *fp;
+ u_int count;
+#ifdef CAPABILITIES
+ cap_rights_t haverights;
+ int error;
+#endif
+
+ if (fd < 0 || fd >= fdp->fd_nfiles)
+ return (EBADF);
+ /*
+	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
+	 * never raising a refcount that has already dropped to 0.  To
+	 * accomplish this we have
+ * to use a cmpset loop rather than an atomic_add. The descriptor
+ * must be re-verified once we acquire a reference to be certain
+ * that the identity is still correct and we did not lose a race
+ * due to preemption.
+ */
+ for (;;) {
+ fp = fdp->fd_ofiles[fd].fde_file;
+ if (fp == NULL)
+ return (EBADF);
+#ifdef CAPABILITIES
+ haverights = *cap_rights(fdp, fd);
+ if (needrightsp != NULL) {
+ error = cap_check(&haverights, needrightsp);
+ if (error != 0)
+ return (error);
+ if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
+ error = cap_fcntl_check(fdp, fd, needfcntl);
+ if (error != 0)
+ return (error);
+ }
+ }
+#endif
+ count = fp->f_count;
+ if (count == 0)
+ continue;
+ /*
+ * Use an acquire barrier to prevent caching of fd_ofiles
+ * so it is refreshed for verification.
+ */
+ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
+ continue;
+ if (fp == fdp->fd_ofiles[fd].fde_file)
+ break;
+ fdrop(fp, curthread);
+ }
+ *fpp = fp;
+ if (haverightsp != NULL) {
+#ifdef CAPABILITIES
+ *haverightsp = haverights;
+#else
+ CAP_ALL(haverightsp);
+#endif
+ }
+ return (0);
+}
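+
+/*
+ * The cmpset loop above is an instance of a general lockless pattern:
+ * take a reference only if the count is still non-zero, then re-check
+ * identity.  A self-contained sketch of the acquire step using C11
+ * atomics; ref_acquire and refcnt are illustrative names, not kernel
+ * symbols.  The block is never compiled here.
+ */
+#if 0
+#include <stdatomic.h>
+#include <stdbool.h>
+
+static bool
+ref_acquire(_Atomic unsigned *refcnt)
+{
+	unsigned old = atomic_load(refcnt);
+
+	for (;;) {
+		if (old == 0)
+			return (false);	/* object is being torn down */
+		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
+			return (true);	/* reference taken; now re-check id */
+		/* The failed CAS reloaded old; retry. */
+	}
+}
+#endif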
+
+/*
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
+ *
+ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
+ * returned.
+ *
+ * File's rights will be checked against the capability rights mask.
+ *
+ * If an error occurred, the non-zero error is returned and *fpp is set to
+ * NULL. Otherwise *fpp is held and set and zero is returned. Caller is
+ * responsible for fdrop().
+ */
+static __inline int
+_fget(struct thread *td, int fd, struct file **fpp, int flags,
+ cap_rights_t *needrightsp, u_char *maxprotp)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ cap_rights_t haverights, needrights;
+ int error;
+
+ *fpp = NULL;
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return (EBADF);
+ if (needrightsp != NULL)
+ needrights = *needrightsp;
+ else
+ cap_rights_init(&needrights);
+ if (maxprotp != NULL)
+ cap_rights_set(&needrights, CAP_MMAP);
+ error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+
+#ifdef CAPABILITIES
+ /*
+ * If requested, convert capability rights to access flags.
+ */
+ if (maxprotp != NULL)
+ *maxprotp = cap_rights_to_vmprot(&haverights);
+#else /* !CAPABILITIES */
+ if (maxprotp != NULL)
+ *maxprotp = VM_PROT_ALL;
+#endif /* CAPABILITIES */
+
+ /*
+ * FREAD and FWRITE failure return EBADF as per POSIX.
+ */
+ error = 0;
+ switch (flags) {
+ case FREAD:
+ case FWRITE:
+ if ((fp->f_flag & flags) == 0)
+ error = EBADF;
+ break;
+ case FEXEC:
+ if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
+ ((fp->f_flag & FWRITE) != 0))
+ error = EBADF;
+ break;
+ case 0:
+ break;
+ default:
+ KASSERT(0, ("wrong flags"));
+ }
+
+ if (error != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+
+ *fpp = fp;
+ return (0);
+}
+
+int
+fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return(_fget(td, fd, fpp, 0, rightsp, NULL));
+}
+
+int
+fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
+ struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
+}
+
+int
+fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
+}
+
+int
+fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
+}
+
+/*
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode. Note that pipes use vnodes but
+ * never have VM objects. The returned vnode will be vref()'d.
+ *
+ * XXX: what about the unused flags ?
+ */
+static __inline int
+_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
+ struct vnode **vpp)
+{
+ struct file *fp;
+ int error;
+
+ *vpp = NULL;
+ error = _fget(td, fd, &fp, flags, needrightsp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_vnode == NULL) {
+ error = EINVAL;
+ } else {
+ *vpp = fp->f_vnode;
+ vref(*vpp);
+ }
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, 0, rightsp, vpp));
+}
+
+int
+fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
+ struct filecaps *havecaps, struct vnode **vpp)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+#ifdef CAPABILITIES
+ int error;
+#endif
+
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return (EBADF);
+
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL || fp->f_ops == &badfileops)
+ return (EBADF);
+
+#ifdef CAPABILITIES
+ if (needrightsp != NULL) {
+ error = cap_check(cap_rights(fdp, fd), needrightsp);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ if (fp->f_vnode == NULL)
+ return (EINVAL);
+
+ *vpp = fp->f_vnode;
+ vref(*vpp);
+ filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
+
+ return (0);
+}
+
+int
+fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FREAD, rightsp, vpp));
+}
+
+int
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
+}
+
+#ifdef notyet
+int
+fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
+ struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
+}
+#endif
+
+/*
+ * Like fget() but loads the underlying socket, or returns an error if the
+ * descriptor does not represent a socket.
+ *
+ * We bump the ref count on the returned socket. XXX Also obtain the SX lock
+ * in the future.
+ *
+ * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
+ * on their file descriptor reference to prevent the socket from being freed
+ * during use.
+ */
+int
+fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
+ u_int *fflagp)
+{
+ struct file *fp;
+ int error;
+
+ *spp = NULL;
+ if (fflagp != NULL)
+ *fflagp = 0;
+ if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ error = ENOTSOCK;
+ } else {
+ *spp = fp->f_data;
+ if (fflagp)
+ *fflagp = fp->f_flag;
+ SOCK_LOCK(*spp);
+ soref(*spp);
+ SOCK_UNLOCK(*spp);
+ }
+ fdrop(fp, td);
+
+ return (error);
+}
+
+/*
+ * Drop the reference count on the socket and XXX release the SX lock in the
+ * future. The last reference closes the socket.
+ *
+ * Note: fputsock() is deprecated, see comment for fgetsock().
+ */
+void
+fputsock(struct socket *so)
+{
+
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ CURVNET_SET(so->so_vnet);
+ sorele(so);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Handle the last reference to a file being closed.
+ */
+int
+_fdrop(struct file *fp, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ if (fp->f_count != 0)
+ panic("fdrop: count %d", fp->f_count);
+ if (fp->f_ops != &badfileops)
+ error = fo_close(fp, td);
+ atomic_subtract_int(&openfiles, 1);
+ crfree(fp->f_cred);
+ free(fp->f_advice, M_FADVISE);
+ uma_zfree(file_zone, fp);
+
+ return (error);
+}
+
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
+/* ARGSUSED */
+int
+sys_flock(struct thread *td, struct flock_args *uap)
+{
+ struct file *fp;
+ struct vnode *vp;
+ struct flock lf;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EOPNOTSUPP);
+ }
+
+ vp = fp->f_vnode;
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (uap->how & LOCK_UN) {
+ lf.l_type = F_UNLCK;
+ atomic_clear_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ goto done2;
+ }
+ if (uap->how & LOCK_EX)
+ lf.l_type = F_WRLCK;
+ else if (uap->how & LOCK_SH)
+ lf.l_type = F_RDLCK;
+ else {
+ error = EBADF;
+ goto done2;
+ }
+ atomic_set_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
+done2:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Duplicate the specified descriptor to a free descriptor.
+ */
+int
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
+ int openerror, int *indxp)
+{
+ struct file *fp;
+ int error, indx;
+
+ KASSERT(openerror == ENODEV || openerror == ENXIO,
+ ("unexpected error %d in %s", openerror, __func__));
+
+ /*
+ * If the to-be-dup'd fd number is greater than the allowed number
+ * of file descriptors, or the fd to be dup'd has already been
+ * closed, then reject.
+ */
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, dfd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+
+ error = fdalloc(td, 0, &indx);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+
+ /*
+ * There are two cases of interest here.
+ *
+ * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
+ *
+ * For ENXIO steal away the file structure from (dfd) and store it in
+ * (indx). (dfd) is effectively closed by this operation.
+ */
+ switch (openerror) {
+ case ENODEV:
+ /*
+ * Check that the mode the file is being opened for is a
+ * subset of the mode of the existing descriptor.
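+		 * For example, opening /dev/fd/N for writing fails with
+		 * EACCES when descriptor N was opened read-only.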
+ */
+ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+ fdunused(fdp, indx);
+ FILEDESC_XUNLOCK(fdp);
+ return (EACCES);
+ }
+ fhold(fp);
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
+ &fdp->fd_ofiles[indx].fde_caps);
+ break;
+ case ENXIO:
+ /*
+ * Steal away the file pointer from dfd and stuff it into indx.
+ */
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
+ fdunused(fdp, dfd);
+ break;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ *indxp = indx;
+ return (0);
+}
+
+/*
+ * Scan all active processes and prisons to see if any of them have a
+ * current, root, or jail directory of `olddp'. If so, replace them with
+ * the new mount point.
+ */
+void
+mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
+{
+ struct filedesc *fdp;
+ struct prison *pr;
+ struct proc *p;
+ int nrele;
+
+ if (vrefcnt(olddp) == 1)
+ return;
+ nrele = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ vref(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vref(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_jdir == olddp) {
+ vref(newdp);
+ fdp->fd_jdir = newdp;
+ nrele++;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ if (rootvnode == olddp) {
+ vref(newdp);
+ rootvnode = newdp;
+ nrele++;
+ }
+ mtx_lock(&prison0.pr_mtx);
+ if (prison0.pr_root == olddp) {
+ vref(newdp);
+ prison0.pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&prison0.pr_mtx);
+ sx_slock(&allprison_lock);
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_root == olddp) {
+ vref(newdp);
+ pr->pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+ sx_sunlock(&allprison_lock);
+ while (nrele--)
+ vrele(olddp);
+}
+
+struct filedesc_to_leader *
+filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
+    struct proc *leader)
+{
+ struct filedesc_to_leader *fdtol;
+
+	fdtol = malloc(sizeof(struct filedesc_to_leader),
+	    M_FILEDESC_TO_LEADER, M_WAITOK);
+ fdtol->fdl_refcount = 1;
+ fdtol->fdl_holdcount = 0;
+ fdtol->fdl_wakeup = 0;
+ fdtol->fdl_leader = leader;
+ if (old != NULL) {
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_next = old->fdl_next;
+ fdtol->fdl_prev = old;
+ old->fdl_next = fdtol;
+ fdtol->fdl_next->fdl_prev = fdtol;
+ FILEDESC_XUNLOCK(fdp);
+ } else {
+ fdtol->fdl_next = fdtol;
+ fdtol->fdl_prev = fdtol;
+ }
+ return (fdtol);
+}
+
+/*
+ * Get file structures globally.
+ */
+static int
+sysctl_kern_file(SYSCTL_HANDLER_ARGS)
+{
+ struct xfile xf;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int error, n;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ if (req->oldptr == NULL) {
+ n = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+			/* This overestimates the size for sparse tables. */
+ if (fdp->fd_lastfile > 0)
+ n += fdp->fd_lastfile;
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+ }
+ error = 0;
+ bzero(&xf, sizeof(xf));
+ xf.xf_size = sizeof(xf);
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p_cansee(req->td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ xf.xf_pid = p->p_pid;
+ xf.xf_uid = p->p_ucred->cr_uid;
+ PROC_UNLOCK(p);
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_SLOCK(fdp);
+ for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ xf.xf_fd = n;
+ xf.xf_file = fp;
+ xf.xf_data = fp->f_data;
+ xf.xf_vnode = fp->f_vnode;
+ xf.xf_type = fp->f_type;
+ xf.xf_count = fp->f_count;
+ xf.xf_msgcount = 0;
+ xf.xf_offset = foffset_get(fp);
+ xf.xf_flag = fp->f_flag;
+ error = SYSCTL_OUT(req, &xf, sizeof(xf));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ if (error)
+ break;
+ }
+ sx_sunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
+
+#ifdef KINFO_OFILE_SIZE
+CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
+#endif
+
+#ifdef COMPAT_FREEBSD7
+static int
+export_vnode_for_osysctl(struct vnode *vp, int type,
+ struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
+{
+ int error;
+ char *fullpath, *freepath;
+
+ bzero(kif, sizeof(*kif));
+ kif->kf_structsize = sizeof(*kif);
+
+ vref(vp);
+ kif->kf_fd = type;
+ kif->kf_type = KF_TYPE_VNODE;
+ /* This function only handles directories. */
+ if (vp->v_type != VDIR) {
+ vrele(vp);
+ return (ENOTDIR);
+ }
+ kif->kf_vnode_type = KF_VTYPE_VDIR;
+
+ /*
+ * This is not a true file descriptor, so we set a bogus refcount
+ * and offset to indicate these fields should be ignored.
+ */
+ kif->kf_ref_count = -1;
+ kif->kf_offset = -1;
+
+ freepath = NULL;
+ fullpath = "-";
+ FILEDESC_SUNLOCK(fdp);
+ vn_fullpath(curthread, vp, &fullpath, &freepath);
+ vrele(vp);
+ strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ error = SYSCTL_OUT(req, kif, sizeof(*kif));
+ FILEDESC_SLOCK(fdp);
+ return (error);
+}
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
+{
+ char *fullpath, *freepath;
+ struct kinfo_ofile *kif;
+ struct filedesc *fdp;
+ int error, i, *name;
+ struct shmfd *shmfd;
+ struct socket *so;
+ struct vnode *vp;
+ struct ksem *ks;
+ struct file *fp;
+ struct proc *p;
+ struct tty *tp;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0)
+ return (error);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ return (ENOENT);
+ kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
+ FILEDESC_SLOCK(fdp);
+ if (fdp->fd_cdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
+ fdp, req);
+ if (fdp->fd_rdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
+ fdp, req);
+ if (fdp->fd_jdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
+ fdp, req);
+ for (i = 0; i < fdp->fd_nfiles; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+ bzero(kif, sizeof(*kif));
+ kif->kf_structsize = sizeof(*kif);
+ ks = NULL;
+ vp = NULL;
+ so = NULL;
+ tp = NULL;
+ shmfd = NULL;
+ kif->kf_fd = i;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ kif->kf_type = KF_TYPE_VNODE;
+ vp = fp->f_vnode;
+ break;
+
+ case DTYPE_SOCKET:
+ kif->kf_type = KF_TYPE_SOCKET;
+ so = fp->f_data;
+ break;
+
+ case DTYPE_PIPE:
+ kif->kf_type = KF_TYPE_PIPE;
+ break;
+
+ case DTYPE_FIFO:
+ kif->kf_type = KF_TYPE_FIFO;
+ vp = fp->f_vnode;
+ break;
+
+ case DTYPE_KQUEUE:
+ kif->kf_type = KF_TYPE_KQUEUE;
+ break;
+
+ case DTYPE_CRYPTO:
+ kif->kf_type = KF_TYPE_CRYPTO;
+ break;
+
+ case DTYPE_MQUEUE:
+ kif->kf_type = KF_TYPE_MQUEUE;
+ break;
+
+ case DTYPE_SHM:
+ kif->kf_type = KF_TYPE_SHM;
+ shmfd = fp->f_data;
+ break;
+
+ case DTYPE_SEM:
+ kif->kf_type = KF_TYPE_SEM;
+ ks = fp->f_data;
+ break;
+
+ case DTYPE_PTS:
+ kif->kf_type = KF_TYPE_PTS;
+ tp = fp->f_data;
+ break;
+
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ kif->kf_type = KF_TYPE_PROCDESC;
+ break;
+#endif
+
+ default:
+ kif->kf_type = KF_TYPE_UNKNOWN;
+ break;
+ }
+ kif->kf_ref_count = fp->f_count;
+ if (fp->f_flag & FREAD)
+ kif->kf_flags |= KF_FLAG_READ;
+ if (fp->f_flag & FWRITE)
+ kif->kf_flags |= KF_FLAG_WRITE;
+ if (fp->f_flag & FAPPEND)
+ kif->kf_flags |= KF_FLAG_APPEND;
+ if (fp->f_flag & FASYNC)
+ kif->kf_flags |= KF_FLAG_ASYNC;
+ if (fp->f_flag & FFSYNC)
+ kif->kf_flags |= KF_FLAG_FSYNC;
+ if (fp->f_flag & FNONBLOCK)
+ kif->kf_flags |= KF_FLAG_NONBLOCK;
+ if (fp->f_flag & O_DIRECT)
+ kif->kf_flags |= KF_FLAG_DIRECT;
+ if (fp->f_flag & FHASLOCK)
+ kif->kf_flags |= KF_FLAG_HASLOCK;
+ kif->kf_offset = foffset_get(fp);
+ if (vp != NULL) {
+ vref(vp);
+ switch (vp->v_type) {
+ case VNON:
+ kif->kf_vnode_type = KF_VTYPE_VNON;
+ break;
+ case VREG:
+ kif->kf_vnode_type = KF_VTYPE_VREG;
+ break;
+ case VDIR:
+ kif->kf_vnode_type = KF_VTYPE_VDIR;
+ break;
+ case VBLK:
+ kif->kf_vnode_type = KF_VTYPE_VBLK;
+ break;
+ case VCHR:
+ kif->kf_vnode_type = KF_VTYPE_VCHR;
+ break;
+ case VLNK:
+ kif->kf_vnode_type = KF_VTYPE_VLNK;
+ break;
+ case VSOCK:
+ kif->kf_vnode_type = KF_VTYPE_VSOCK;
+ break;
+ case VFIFO:
+ kif->kf_vnode_type = KF_VTYPE_VFIFO;
+ break;
+ case VBAD:
+ kif->kf_vnode_type = KF_VTYPE_VBAD;
+ break;
+ default:
+ kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
+ break;
+ }
+ /*
+ * It is OK to drop the filedesc lock here as we will
+ * re-validate and re-evaluate its properties when
+ * the loop continues.
+ */
+ freepath = NULL;
+ fullpath = "-";
+ FILEDESC_SUNLOCK(fdp);
+ vn_fullpath(curthread, vp, &fullpath, &freepath);
+ vrele(vp);
+ strlcpy(kif->kf_path, fullpath,
+ sizeof(kif->kf_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ FILEDESC_SLOCK(fdp);
+ }
+ if (so != NULL) {
+ struct sockaddr *sa;
+
+ if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
+ == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
+ bcopy(sa, &kif->kf_sa_local, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
+ == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
+ bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ kif->kf_sock_domain =
+ so->so_proto->pr_domain->dom_family;
+ kif->kf_sock_type = so->so_type;
+ kif->kf_sock_protocol = so->so_proto->pr_protocol;
+ }
+ if (tp != NULL) {
+ strlcpy(kif->kf_path, tty_devname(tp),
+ sizeof(kif->kf_path));
+ }
+ if (shmfd != NULL)
+ shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
+ if (ks != NULL && ksem_info != NULL)
+ ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
+ error = SYSCTL_OUT(req, kif, sizeof(*kif));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ free(kif, M_TEMP);
+ return (0);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
+ sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
+#endif /* COMPAT_FREEBSD7 */
+
+#ifdef KINFO_FILE_SIZE
+CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
+#endif
+
+struct export_fd_buf {
+ struct filedesc *fdp;
+ struct sbuf *sb;
+ ssize_t remainder;
+ struct kinfo_file kif;
+};
+
+static int
+export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
+ int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
+{
+ struct {
+ int fflag;
+ int kf_fflag;
+ } fflags_table[] = {
+ { FAPPEND, KF_FLAG_APPEND },
+ { FASYNC, KF_FLAG_ASYNC },
+ { FFSYNC, KF_FLAG_FSYNC },
+ { FHASLOCK, KF_FLAG_HASLOCK },
+ { FNONBLOCK, KF_FLAG_NONBLOCK },
+ { FREAD, KF_FLAG_READ },
+ { FWRITE, KF_FLAG_WRITE },
+ { O_CREAT, KF_FLAG_CREAT },
+ { O_DIRECT, KF_FLAG_DIRECT },
+ { O_EXCL, KF_FLAG_EXCL },
+ { O_EXEC, KF_FLAG_EXEC },
+ { O_EXLOCK, KF_FLAG_EXLOCK },
+ { O_NOFOLLOW, KF_FLAG_NOFOLLOW },
+ { O_SHLOCK, KF_FLAG_SHLOCK },
+ { O_TRUNC, KF_FLAG_TRUNC }
+ };
+#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table))
+ struct kinfo_file *kif;
+ struct vnode *vp;
+ int error, locked;
+ unsigned int i;
+
+ if (efbuf->remainder == 0)
+ return (0);
+ kif = &efbuf->kif;
+ bzero(kif, sizeof(*kif));
+ locked = efbuf->fdp != NULL;
+ switch (type) {
+ case KF_TYPE_FIFO:
+ case KF_TYPE_VNODE:
+ if (locked) {
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ locked = 0;
+ }
+ vp = (struct vnode *)data;
+ error = fill_vnode_info(vp, kif);
+ vrele(vp);
+ break;
+ case KF_TYPE_SOCKET:
+ error = fill_socket_info((struct socket *)data, kif);
+ break;
+ case KF_TYPE_PIPE:
+ error = fill_pipe_info((struct pipe *)data, kif);
+ break;
+ case KF_TYPE_PTS:
+ error = fill_pts_info((struct tty *)data, kif);
+ break;
+ case KF_TYPE_PROCDESC:
+ error = fill_procdesc_info((struct procdesc *)data, kif);
+ break;
+ case KF_TYPE_SEM:
+ error = fill_sem_info((struct file *)data, kif);
+ break;
+ case KF_TYPE_SHM:
+ error = fill_shm_info((struct file *)data, kif);
+ break;
+ default:
+ error = 0;
+ }
+ if (error == 0)
+ kif->kf_status |= KF_ATTR_VALID;
+
+ /*
+ * Translate file access flags.
+ */
+ for (i = 0; i < NFFLAGS; i++)
+ if (fflags & fflags_table[i].fflag)
+ kif->kf_flags |= fflags_table[i].kf_fflag;
+ if (rightsp != NULL)
+ kif->kf_cap_rights = *rightsp;
+ else
+ cap_rights_init(&kif->kf_cap_rights);
+ kif->kf_fd = fd;
+ kif->kf_type = type;
+ kif->kf_ref_count = refcnt;
+ kif->kf_offset = offset;
+ /* Pack record size down */
+ kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
+ strlen(kif->kf_path) + 1;
+ kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
+ if (efbuf->remainder != -1) {
+ if (efbuf->remainder < kif->kf_structsize) {
+ /* Terminate export. */
+ efbuf->remainder = 0;
+ if (efbuf->fdp != NULL && !locked)
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (0);
+ }
+ efbuf->remainder -= kif->kf_structsize;
+ }
+ if (locked)
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize);
+ if (efbuf->fdp != NULL)
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (error);
+}
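+
+/*
+ * Example of the packing above: for a kf_path of "/tmp" (four characters
+ * plus the terminating NUL), kf_structsize becomes
+ * offsetof(struct kinfo_file, kf_path) + 5, rounded up to the next multiple
+ * of sizeof(uint64_t), and consumers walk the exported buffer record by
+ * record using kf_structsize as the stride.
+ */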
+
+/*
+ * Store a process's file descriptor information in an sbuf.
+ *
+ * Takes a locked proc as argument, and returns with the proc unlocked.
+ */
+int
+kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+ struct export_fd_buf *efbuf;
+ struct vnode *cttyvp, *textvp, *tracevp;
+ int64_t offset;
+ void *data;
+ int error, i;
+ int type, refcnt, fflags;
+ cap_rights_t rights;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /* ktrace vnode */
+ tracevp = p->p_tracevp;
+ if (tracevp != NULL)
+ vref(tracevp);
+ /* text vnode */
+ textvp = p->p_textvp;
+ if (textvp != NULL)
+ vref(textvp);
+ /* Controlling tty. */
+ cttyvp = NULL;
+ if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
+ cttyvp = p->p_pgrp->pg_session->s_ttyvp;
+ if (cttyvp != NULL)
+ vref(cttyvp);
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+ efbuf->fdp = NULL;
+ efbuf->sb = sb;
+ efbuf->remainder = maxlen;
+ if (tracevp != NULL)
+ export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
+ if (textvp != NULL)
+ export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
+ FREAD, -1, -1, NULL, efbuf);
+ if (cttyvp != NULL)
+ export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
+ error = 0;
+ if (fdp == NULL)
+ goto fail;
+ efbuf->fdp = fdp;
+ FILEDESC_SLOCK(fdp);
+ /* working directory */
+ if (fdp->fd_cdir != NULL) {
+ vref(fdp->fd_cdir);
+ data = fdp->fd_cdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ /* root directory */
+ if (fdp->fd_rdir != NULL) {
+ vref(fdp->fd_rdir);
+ data = fdp->fd_rdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ /* jail directory */
+ if (fdp->fd_jdir != NULL) {
+ vref(fdp->fd_jdir);
+ data = fdp->fd_jdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ for (i = 0; i < fdp->fd_nfiles; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+ data = NULL;
+#ifdef CAPABILITIES
+ rights = *cap_rights(fdp, i);
+#else /* !CAPABILITIES */
+ cap_rights_init(&rights);
+#endif
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ type = KF_TYPE_VNODE;
+ vref(fp->f_vnode);
+ data = fp->f_vnode;
+ break;
+
+ case DTYPE_SOCKET:
+ type = KF_TYPE_SOCKET;
+ data = fp->f_data;
+ break;
+
+ case DTYPE_PIPE:
+ type = KF_TYPE_PIPE;
+ data = fp->f_data;
+ break;
+
+ case DTYPE_FIFO:
+ type = KF_TYPE_FIFO;
+ vref(fp->f_vnode);
+ data = fp->f_vnode;
+ break;
+
+ case DTYPE_KQUEUE:
+ type = KF_TYPE_KQUEUE;
+ break;
+
+ case DTYPE_CRYPTO:
+ type = KF_TYPE_CRYPTO;
+ break;
+
+ case DTYPE_MQUEUE:
+ type = KF_TYPE_MQUEUE;
+ break;
+
+ case DTYPE_SHM:
+ type = KF_TYPE_SHM;
+ data = fp;
+ break;
+
+ case DTYPE_SEM:
+ type = KF_TYPE_SEM;
+ data = fp;
+ break;
+
+ case DTYPE_PTS:
+ type = KF_TYPE_PTS;
+ data = fp->f_data;
+ break;
+
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ type = KF_TYPE_PROCDESC;
+ data = fp->f_data;
+ break;
+#endif
+
+ default:
+ type = KF_TYPE_UNKNOWN;
+ break;
+ }
+ refcnt = fp->f_count;
+ fflags = fp->f_flag;
+ offset = foffset_get(fp);
+
+ /*
+ * Create sysctl entry.
+ * It is OK to drop the filedesc lock here as we will
+ * re-validate and re-evaluate its properties when
+ * the loop continues.
+ */
+ error = export_fd_to_sb(data, type, i, fflags, refcnt,
+ offset, &rights, efbuf);
+ if (error != 0)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+fail:
+ free(efbuf, M_TEMP);
+ return (error);
+}
+
+#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5)
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sb;
+ struct proc *p;
+ ssize_t maxlen;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+
+ sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ maxlen = req->oldptr != NULL ? req->oldlen : -1;
+ error = kern_proc_filedesc_out(p, &sb, maxlen);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
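+
+/*
+ * Example consumer: procstat(1) reads these records through the
+ * kern.proc.filedesc sysctl, e.g. "procstat -f <pid>" lists the open
+ * descriptors of a single process.
+ */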
+
+int
+vntype_to_kinfo(int vtype)
+{
+ struct {
+ int vtype;
+ int kf_vtype;
+ } vtypes_table[] = {
+ { VBAD, KF_VTYPE_VBAD },
+ { VBLK, KF_VTYPE_VBLK },
+ { VCHR, KF_VTYPE_VCHR },
+ { VDIR, KF_VTYPE_VDIR },
+ { VFIFO, KF_VTYPE_VFIFO },
+ { VLNK, KF_VTYPE_VLNK },
+ { VNON, KF_VTYPE_VNON },
+ { VREG, KF_VTYPE_VREG },
+ { VSOCK, KF_VTYPE_VSOCK }
+ };
+#define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table))
+ unsigned int i;
+
+ /*
+ * Perform vtype translation.
+ */
+ for (i = 0; i < NVTYPES; i++)
+ if (vtypes_table[i].vtype == vtype)
+ break;
+ if (i < NVTYPES)
+ return (vtypes_table[i].kf_vtype);
+
+ return (KF_VTYPE_UNKNOWN);
+}
+
+static int
+fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
+{
+ struct vattr va;
+ char *fullpath, *freepath;
+ int error;
+
+ if (vp == NULL)
+ return (1);
+ kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
+ freepath = NULL;
+ fullpath = "-";
+ error = vn_fullpath(curthread, vp, &fullpath, &freepath);
+ if (error == 0) {
+ strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+ }
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ /*
+ * Retrieve vnode attributes.
+ */
+ va.va_fsid = VNOVAL;
+ va.va_rdev = NODEV;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &va, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error != 0)
+ return (error);
+ if (va.va_fsid != VNOVAL)
+ kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
+ else
+ kif->kf_un.kf_file.kf_file_fsid =
+ vp->v_mount->mnt_stat.f_fsid.val[0];
+ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
+ kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
+ kif->kf_un.kf_file.kf_file_size = va.va_size;
+ kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
+ return (0);
+}
+
+static int
+fill_socket_info(struct socket *so, struct kinfo_file *kif)
+{
+ struct sockaddr *sa;
+ struct inpcb *inpcb;
+ struct unpcb *unpcb;
+ int error;
+
+ if (so == NULL)
+ return (1);
+ kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
+ kif->kf_sock_type = so->so_type;
+ kif->kf_sock_protocol = so->so_proto->pr_protocol;
+ kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
+	switch (kif->kf_sock_domain) {
+ case AF_INET:
+ case AF_INET6:
+ if (kif->kf_sock_protocol == IPPROTO_TCP) {
+ if (so->so_pcb != NULL) {
+ inpcb = (struct inpcb *)(so->so_pcb);
+ kif->kf_un.kf_sock.kf_sock_inpcb =
+ (uintptr_t)inpcb->inp_ppcb;
+ }
+ }
+ break;
+ case AF_UNIX:
+ if (so->so_pcb != NULL) {
+ unpcb = (struct unpcb *)(so->so_pcb);
+ if (unpcb->unp_conn) {
+ kif->kf_un.kf_sock.kf_sock_unpconn =
+ (uintptr_t)unpcb->unp_conn;
+ kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
+ so->so_rcv.sb_state;
+ kif->kf_un.kf_sock.kf_sock_snd_sb_state =
+ so->so_snd.sb_state;
+ }
+ }
+ break;
+ }
+ error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+ if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
+ bcopy(sa, &kif->kf_sa_local, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
+ if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
+ bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
+ sizeof(kif->kf_path));
+ return (0);
+}
+
+static int
+fill_pts_info(struct tty *tp, struct kinfo_file *kif)
+{
+
+ if (tp == NULL)
+ return (1);
+ kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
+ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
+ return (0);
+}
+
+static int
+fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
+{
+
+ if (pi == NULL)
+ return (1);
+ kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
+ kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
+ kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
+ return (0);
+}
+
+static int
+fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
+{
+
+ if (pdp == NULL)
+ return (1);
+ kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
+ return (0);
+}
+
+static int
+fill_sem_info(struct file *fp, struct kinfo_file *kif)
+{
+ struct thread *td;
+ struct stat sb;
+
+ td = curthread;
+ if (fp->f_data == NULL)
+ return (1);
+ if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+ return (1);
+ if (ksem_info == NULL)
+ return (1);
+ ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
+ &kif->kf_un.kf_sem.kf_sem_value);
+ kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
+ return (0);
+}
+
+static int
+fill_shm_info(struct file *fp, struct kinfo_file *kif)
+{
+ struct thread *td;
+ struct stat sb;
+
+ td = curthread;
+ if (fp->f_data == NULL)
+ return (1);
+ if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+ return (1);
+ shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
+ kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
+ kif->kf_un.kf_file.kf_file_size = sb.st_size;
+ return (0);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
+ sysctl_kern_proc_filedesc, "Process filedesc entries");
+
+#ifdef DDB
+/*
+ * For the purposes of debugging, generate a human-readable string for the
+ * file type.
+ */
+static const char *
+file_type_to_name(short type)
+{
+
+ switch (type) {
+ case 0:
+ return ("zero");
+ case DTYPE_VNODE:
+ return ("vnod");
+ case DTYPE_SOCKET:
+ return ("sock");
+ case DTYPE_PIPE:
+ return ("pipe");
+ case DTYPE_FIFO:
+ return ("fifo");
+ case DTYPE_KQUEUE:
+ return ("kque");
+ case DTYPE_CRYPTO:
+ return ("crpt");
+ case DTYPE_MQUEUE:
+ return ("mque");
+ case DTYPE_SHM:
+ return ("shm");
+ case DTYPE_SEM:
+ return ("ksem");
+ default:
+ return ("unkn");
+ }
+}
+
+/*
+ * For the purposes of debugging, identify a process (if any, perhaps one of
+ * many) that references the passed file in its file descriptor array. Return
+ * NULL if none.
+ */
+static struct proc *
+file_to_first_proc(struct file *fp)
+{
+ struct filedesc *fdp;
+ struct proc *p;
+ int n;
+
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = p->p_fd;
+ if (fdp == NULL)
+ continue;
+ for (n = 0; n < fdp->fd_nfiles; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
+ return (p);
+ }
+ }
+ return (NULL);
+}
+
+static void
+db_print_file(struct file *fp, int header)
+{
+ struct proc *p;
+
+ if (header)
+ db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
+ "File", "Type", "Data", "Flag", "GCFl", "Count",
+ "MCount", "Vnode", "FPID", "FCmd");
+ p = file_to_first_proc(fp);
+ db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
+ file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
+ 0, fp->f_count, 0, fp->f_vnode,
+ p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+}
+
+DB_SHOW_COMMAND(file, db_show_file)
+{
+ struct file *fp;
+
+ if (!have_addr) {
+ db_printf("usage: show file <addr>\n");
+ return;
+ }
+ fp = (struct file *)addr;
+ db_print_file(fp, 1);
+}
+
+DB_SHOW_COMMAND(files, db_show_files)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int header;
+ int n;
+
+ header = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ if ((fdp = p->p_fd) == NULL)
+ continue;
+ for (n = 0; n < fdp->fd_nfiles; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ db_print_file(fp, header);
+ header = 0;
+ }
+ }
+}
+#endif
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
+ &maxfilesperproc, 0, "Maximum files allowed open per process");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
+ &maxfiles, 0, "Maximum number of files");
+
+SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
+ __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
+
+/* ARGSUSED*/
+static void
+filelistinit(void *dummy)
+{
+
+ file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
+ mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
+}
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
+
+/*-------------------------------------------------------------------*/
+
+static int
+badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (0);
+}
+
+static int
+badfo_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_close(struct file *fp, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+struct fileops badfileops = {
+ .fo_read = badfo_readwrite,
+ .fo_write = badfo_readwrite,
+ .fo_truncate = badfo_truncate,
+ .fo_ioctl = badfo_ioctl,
+ .fo_poll = badfo_poll,
+ .fo_kqfilter = badfo_kqfilter,
+ .fo_stat = badfo_stat,
+ .fo_close = badfo_close,
+ .fo_chmod = badfo_chmod,
+ .fo_chown = badfo_chown,
+ .fo_sendfile = badfo_sendfile,
+};
+
+int
+invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+/*-------------------------------------------------------------------*/
+
+/*
+ * File Descriptor pseudo-device driver (/dev/fd/).
+ *
+ * Opening minor device N dup()s the file (if any) connected to file
+ * descriptor N belonging to the calling process. Note that this driver
+ * consists of only the ``open()'' routine, because all subsequent
+ * references to this file will go directly to the other driver.
+ *
+ * XXX: we could give this one a cloning event handler if necessary.
+ */
+
+/* ARGSUSED */
+static int
+fdopen(struct cdev *dev, int mode, int type, struct thread *td)
+{
+
+ /*
+	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
+	 * file descriptor being sought for duplication. The error
+	 * return ensures that the vnode for this device will be released
+	 * by vn_open. Open will detect this special error and take the
+	 * actions in dupfdopen(). Other callers of vn_open or VOP_OPEN
+ * will simply report the error.
+ */
+ td->td_dupfd = dev2unit(dev);
+ return (ENODEV);
+}
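+
+/*
+ * Illustrative flow: open("/dev/fd/3") reaches fdopen(), which records 3 in
+ * td_dupfd and fails with ENODEV; the open path detects this and calls
+ * dupfdopen() (defined earlier in this file) to dup descriptor 3 into the
+ * newly allocated slot. From a shell this is the familiar
+ * "cat /dev/fd/0 < file" idiom.
+ */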
+
+static struct cdevsw fildesc_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = fdopen,
+ .d_name = "FD",
+};
+
+static void
+fildesc_drvinit(void *unused)
+{
+ struct cdev *dev;
+
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/0");
+ make_dev_alias(dev, "stdin");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/1");
+ make_dev_alias(dev, "stdout");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/2");
+ make_dev_alias(dev, "stderr");
+}
+
+SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/sys/kern/kern_dtrace.c b/sys/kern/kern_dtrace.c
new file mode 100644
index 0000000..5582fb9
--- /dev/null
+++ b/sys/kern/kern_dtrace.c
@@ -0,0 +1,117 @@
+/*-
+ * Copyright (c) 2007-2008 John Birrell <jb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/dtrace_bsd.h>
+#include <sys/sysctl.h>
+
+#define KDTRACE_PROC_SIZE 64
+#define KDTRACE_THREAD_SIZE 256
+
+FEATURE(kdtrace_hooks,
+ "Kernel DTrace hooks which are required to load DTrace kernel modules");
+
+static MALLOC_DEFINE(M_KDTRACE, "kdtrace", "DTrace hooks");
+
+/* Return the DTrace process data size compiled in the kernel hooks. */
+size_t
+kdtrace_proc_size()
+{
+
+ return (KDTRACE_PROC_SIZE);
+}
+
+static void
+kdtrace_proc_ctor(void *arg __unused, struct proc *p)
+{
+
+ p->p_dtrace = malloc(KDTRACE_PROC_SIZE, M_KDTRACE, M_WAITOK|M_ZERO);
+}
+
+static void
+kdtrace_proc_dtor(void *arg __unused, struct proc *p)
+{
+
+ if (p->p_dtrace != NULL) {
+ free(p->p_dtrace, M_KDTRACE);
+ p->p_dtrace = NULL;
+ }
+}
+
+/* Return the DTrace thread data size compiled in the kernel hooks. */
+size_t
+kdtrace_thread_size()
+{
+
+ return (KDTRACE_THREAD_SIZE);
+}
+
+static void
+kdtrace_thread_ctor(void *arg __unused, struct thread *td)
+{
+
+ td->td_dtrace = malloc(KDTRACE_THREAD_SIZE, M_KDTRACE, M_WAITOK|M_ZERO);
+}
+
+static void
+kdtrace_thread_dtor(void *arg __unused, struct thread *td)
+{
+
+ if (td->td_dtrace != NULL) {
+ free(td->td_dtrace, M_KDTRACE);
+ td->td_dtrace = NULL;
+ }
+}
+
+/*
+ * Initialise the kernel DTrace hooks.
+ */
+static void
+init_dtrace(void *dummy __unused)
+{
+
+ EVENTHANDLER_REGISTER(process_ctor, kdtrace_proc_ctor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(process_dtor, kdtrace_proc_dtor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(thread_ctor, kdtrace_thread_ctor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(thread_dtor, kdtrace_thread_dtor, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+SYSINIT(kdtrace, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_dtrace, NULL);
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
new file mode 100644
index 0000000..e89b3f7
--- /dev/null
+++ b/sys/kern/kern_environment.c
@@ -0,0 +1,626 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The unified bootloader passes us a pointer to a preserved copy of
+ * bootstrap/kernel environment variables. We convert them to a
+ * dynamic array of strings later when the VM subsystem is up.
+ *
+ * We make these available through the kenv(2) syscall for userland
+ * and through getenv(), freeenv(), setenv(), unsetenv(), and testenv()
+ * for the kernel.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/libkern.h>
+#include <sys/kenv.h>
+
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
+
+#define KENV_SIZE 512 /* Maximum number of environment strings */
+
+/* pointer to the static environment */
+char *kern_envp;
+static int env_len;
+static int env_pos;
+static char *kernenv_next(char *);
+
+/* dynamic environment variables */
+char **kenvp;
+struct mtx kenv_lock;
+
+/*
+ * No need to protect this with a mutex since SYSINITs are single-threaded.
+ */
+int dynamic_kenv = 0;
+
+#define KENV_CHECK if (!dynamic_kenv) \
+ panic("%s: called before SI_SUB_KMEM", __func__)
+
+int
+sys_kenv(struct thread *td, struct kenv_args *uap)
+{
+	/* uap: { int what; const char *name; char *value; int len; } */
+ char *name, *value, *buffer = NULL;
+ size_t len, done, needed, buflen;
+ int error, i;
+
+ KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
+
+ error = 0;
+ if (uap->what == KENV_DUMP) {
+#ifdef MAC
+ error = mac_kenv_check_dump(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+ done = needed = 0;
+ buflen = uap->len;
+ if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2))
+ buflen = KENV_SIZE * (KENV_MNAMELEN +
+ KENV_MVALLEN + 2);
+ if (uap->len > 0 && uap->value != NULL)
+ buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO);
+ mtx_lock(&kenv_lock);
+ for (i = 0; kenvp[i] != NULL; i++) {
+ len = strlen(kenvp[i]) + 1;
+ needed += len;
+ len = min(len, buflen - done);
+ /*
+ * If called with a NULL or insufficiently large
+ * buffer, just keep computing the required size.
+ */
+ if (uap->value != NULL && buffer != NULL && len > 0) {
+ bcopy(kenvp[i], buffer + done, len);
+ done += len;
+ }
+ }
+ mtx_unlock(&kenv_lock);
+ if (buffer != NULL) {
+ error = copyout(buffer, uap->value, done);
+ free(buffer, M_TEMP);
+ }
+ td->td_retval[0] = ((done == needed) ? 0 : needed);
+ return (error);
+ }
+
+ switch (uap->what) {
+ case KENV_SET:
+ error = priv_check(td, PRIV_KENV_SET);
+ if (error)
+ return (error);
+ break;
+
+ case KENV_UNSET:
+ error = priv_check(td, PRIV_KENV_UNSET);
+ if (error)
+ return (error);
+ break;
+ }
+
+ name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK);
+
+ error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL);
+ if (error)
+ goto done;
+
+ switch (uap->what) {
+ case KENV_GET:
+#ifdef MAC
+ error = mac_kenv_check_get(td->td_ucred, name);
+ if (error)
+ goto done;
+#endif
+ value = getenv(name);
+ if (value == NULL) {
+ error = ENOENT;
+ goto done;
+ }
+ len = strlen(value) + 1;
+ if (len > uap->len)
+ len = uap->len;
+ error = copyout(value, uap->value, len);
+ freeenv(value);
+ if (error)
+ goto done;
+ td->td_retval[0] = len;
+ break;
+ case KENV_SET:
+ len = uap->len;
+ if (len < 1) {
+ error = EINVAL;
+ goto done;
+ }
+ if (len > KENV_MVALLEN + 1)
+ len = KENV_MVALLEN + 1;
+ value = malloc(len, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->value, value, len, NULL);
+ if (error) {
+ free(value, M_TEMP);
+ goto done;
+ }
+#ifdef MAC
+ error = mac_kenv_check_set(td->td_ucred, name, value);
+ if (error == 0)
+#endif
+ setenv(name, value);
+ free(value, M_TEMP);
+ break;
+ case KENV_UNSET:
+#ifdef MAC
+ error = mac_kenv_check_unset(td->td_ucred, name);
+ if (error)
+ goto done;
+#endif
+ error = unsetenv(name);
+ if (error)
+ error = ENOENT;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ free(name, M_TEMP);
+ return (error);
+}
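+
+/*
+ * Illustrative userland usage of kenv(2); the variable name is only an
+ * example:
+ *
+ *	char buf[128];
+ *	int len;
+ *
+ *	len = kenv(KENV_GET, "smbios.system.product", buf, sizeof(buf));
+ *	if (len == -1)
+ *		warn("kenv");
+ */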
+
+void
+init_static_kenv(char *buf, size_t len)
+{
+ kern_envp = buf;
+ env_len = len;
+ env_pos = 0;
+}
+
+/*
+ * Set up the dynamic kernel environment.
+ */
+static void
+init_dynamic_kenv(void *data __unused)
+{
+ char *cp;
+ size_t len;
+ int i;
+
+ kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
+ M_WAITOK | M_ZERO);
+ i = 0;
+ if (kern_envp && *kern_envp != '\0') {
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ len = strlen(cp) + 1;
+ if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
+ printf(
+ "WARNING: too long kenv string, ignoring %s\n",
+ cp);
+ continue;
+ }
+ if (i < KENV_SIZE) {
+ kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+ strcpy(kenvp[i++], cp);
+ } else
+ printf(
+ "WARNING: too many kenv strings, ignoring %s\n",
+ cp);
+ }
+ }
+ kenvp[i] = NULL;
+
+ mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
+ dynamic_kenv = 1;
+}
+SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
+
+void
+freeenv(char *env)
+{
+
+ if (dynamic_kenv)
+ free(env, M_KENV);
+}
+
+/*
+ * Internal functions for string lookup.
+ */
+static char *
+_getenv_dynamic(const char *name, int *idx)
+{
+ char *cp;
+ int len, i;
+
+ mtx_assert(&kenv_lock, MA_OWNED);
+ len = strlen(name);
+ for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
+ if ((strncmp(cp, name, len) == 0) &&
+ (cp[len] == '=')) {
+ if (idx != NULL)
+ *idx = i;
+ return (cp + len + 1);
+ }
+ }
+ return (NULL);
+}
+
+static char *
+_getenv_static(const char *name)
+{
+ char *cp, *ep;
+ int len;
+
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
+ ;
+ if (*ep != '=')
+ continue;
+ len = ep - cp;
+ ep++;
+ if (!strncmp(name, cp, len) && name[len] == 0)
+ return (ep);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up an environment variable by name. Return a pointer to the value
+ * string if found; the returned pointer must be freed with freeenv() after
+ * use.
+ */
+char *
+getenv(const char *name)
+{
+ char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
+ char *ret, *cp;
+ int len;
+
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ if (cp != NULL) {
+ strcpy(buf, cp);
+ mtx_unlock(&kenv_lock);
+ len = strlen(buf) + 1;
+ ret = malloc(len, M_KENV, M_WAITOK);
+ strcpy(ret, buf);
+ } else {
+ mtx_unlock(&kenv_lock);
+ ret = NULL;
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "getenv");
+ }
+ } else
+ ret = _getenv_static(name);
+ return (ret);
+}
+
+/*
+ * Test if an environment variable is defined.
+ */
+int
+testenv(const char *name)
+{
+ char *cp;
+
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ mtx_unlock(&kenv_lock);
+ } else
+ cp = _getenv_static(name);
+ if (cp != NULL)
+ return (1);
+ return (0);
+}
+
+static int
+setenv_static(const char *name, const char *value)
+{
+ int len;
+
+ if (env_pos >= env_len)
+ return (-1);
+
+ /* Check space for x=y and two nuls */
+ len = strlen(name) + strlen(value);
+ if (len + 3 < env_len - env_pos) {
+ len = sprintf(&kern_envp[env_pos], "%s=%s", name, value);
+ env_pos += len+1;
+ kern_envp[env_pos] = '\0';
+ return (0);
+ } else
+ return (-1);
+}
+
+/*
+ * Set an environment variable by name.
+ */
+int
+setenv(const char *name, const char *value)
+{
+ char *buf, *cp, *oldenv;
+ int namelen, vallen, i;
+
+ if (dynamic_kenv == 0 && env_len > 0)
+ return (setenv_static(name, value));
+
+ KENV_CHECK;
+
+ namelen = strlen(name) + 1;
+ if (namelen > KENV_MNAMELEN + 1)
+ return (-1);
+ vallen = strlen(value) + 1;
+ if (vallen > KENV_MVALLEN + 1)
+ return (-1);
+ buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
+ sprintf(buf, "%s=%s", name, value);
+
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ kenvp[i] = buf;
+ mtx_unlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ } else {
+		/* Add the variable if it wasn't found. */
+ for (i = 0; (cp = kenvp[i]) != NULL; i++)
+ ;
+
+ /* Bounds checking */
+ if (i < 0 || i >= KENV_SIZE) {
+ free(buf, M_KENV);
+ mtx_unlock(&kenv_lock);
+ return (-1);
+ }
+
+ kenvp[i] = buf;
+ kenvp[i + 1] = NULL;
+ mtx_unlock(&kenv_lock);
+ }
+ return (0);
+}
+
+/*
+ * Unset an environment variable string.
+ */
+int
+unsetenv(const char *name)
+{
+ char *cp, *oldenv;
+ int i, j;
+
+ KENV_CHECK;
+
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ for (j = i + 1; kenvp[j] != NULL; j++)
+ kenvp[i++] = kenvp[j];
+ kenvp[i] = NULL;
+ mtx_unlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ return (0);
+ }
+ mtx_unlock(&kenv_lock);
+ return (-1);
+}
+
+/*
+ * Return a string value from an environment variable.
+ */
+int
+getenv_string(const char *name, char *data, int size)
+{
+ char *tmp;
+
+ tmp = getenv(name);
+ if (tmp != NULL) {
+ strlcpy(data, tmp, size);
+ freeenv(tmp);
+ return (1);
+ } else
+ return (0);
+}
+
+/*
+ * Return an integer value from an environment variable.
+ */
+int
+getenv_int(const char *name, int *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (int) tmp;
+ return (rval);
+}
+
+/*
+ * Return an unsigned integer value from an environment variable.
+ */
+int
+getenv_uint(const char *name, unsigned int *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (unsigned int) tmp;
+ return (rval);
+}
+
+/*
+ * Return a long value from an environment variable.
+ */
+int
+getenv_long(const char *name, long *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (long) tmp;
+ return (rval);
+}
+
+/*
+ * Return an unsigned long value from an environment variable.
+ */
+int
+getenv_ulong(const char *name, unsigned long *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (unsigned long) tmp;
+ return (rval);
+}
+
+/*
+ * Return a quad_t value from an environment variable.
+ */
+int
+getenv_quad(const char *name, quad_t *data)
+{
+ char *value;
+ char *vtp;
+ quad_t iv;
+
+ value = getenv(name);
+ if (value == NULL)
+ return (0);
+ iv = strtoq(value, &vtp, 0);
+ if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
+ freeenv(value);
+ return (0);
+ }
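+	/*
+	 * Each recognized suffix scales the value by successive factors of
+	 * 1024 via intentional fall-through below: e.g. "16k" yields 16384
+	 * and "8M" yields 8388608.
+	 */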
+ switch (vtp[0]) {
+ case 't': case 'T':
+ iv *= 1024;
+ case 'g': case 'G':
+ iv *= 1024;
+ case 'm': case 'M':
+ iv *= 1024;
+ case 'k': case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ freeenv(value);
+ return (0);
+ }
+ *data = iv;
+ freeenv(value);
+ return (1);
+}
+
+/*
+ * Find the next entry after the one within which (cp) falls; return a
+ * pointer to its start, or NULL if there are no more.
+ */
+static char *
+kernenv_next(char *cp)
+{
+
+ if (cp != NULL) {
+ while (*cp != 0)
+ cp++;
+ cp++;
+ if (*cp == 0)
+ cp = NULL;
+ }
+ return (cp);
+}
+
+void
+tunable_int_init(void *data)
+{
+ struct tunable_int *d = (struct tunable_int *)data;
+
+ TUNABLE_INT_FETCH(d->path, d->var);
+}
+
+void
+tunable_long_init(void *data)
+{
+ struct tunable_long *d = (struct tunable_long *)data;
+
+ TUNABLE_LONG_FETCH(d->path, d->var);
+}
+
+void
+tunable_ulong_init(void *data)
+{
+ struct tunable_ulong *d = (struct tunable_ulong *)data;
+
+ TUNABLE_ULONG_FETCH(d->path, d->var);
+}
+
+void
+tunable_quad_init(void *data)
+{
+ struct tunable_quad *d = (struct tunable_quad *)data;
+
+ TUNABLE_QUAD_FETCH(d->path, d->var);
+}
+
+void
+tunable_str_init(void *data)
+{
+ struct tunable_str *d = (struct tunable_str *)data;
+
+ TUNABLE_STR_FETCH(d->path, d->var, d->size);
+}
diff --git a/sys/kern/kern_et.c b/sys/kern/kern_et.c
new file mode 100644
index 0000000..d07316c
--- /dev/null
+++ b/sys/kern/kern_et.c
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/timeet.h>
+
+SLIST_HEAD(et_eventtimers_list, eventtimer);
+static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers);
+
+struct mtx et_eventtimers_mtx;
+MTX_SYSINIT(et_eventtimers_init, &et_eventtimers_mtx, "et_mtx", MTX_DEF);
+
+SYSCTL_NODE(_kern, OID_AUTO, eventtimer, CTLFLAG_RW, 0, "Event timers");
+static SYSCTL_NODE(_kern_eventtimer, OID_AUTO, et, CTLFLAG_RW, 0, "");
+
+/*
+ * Register new event timer hardware.
+ */
+int
+et_register(struct eventtimer *et)
+{
+ struct eventtimer *tmp, *next;
+
+ if (et->et_quality >= 0 || bootverbose) {
+ if (et->et_frequency == 0) {
+ printf("Event timer \"%s\" quality %d\n",
+ et->et_name, et->et_quality);
+ } else {
+ printf("Event timer \"%s\" "
+ "frequency %ju Hz quality %d\n",
+ et->et_name, (uintmax_t)et->et_frequency,
+ et->et_quality);
+ }
+ }
+ KASSERT(et->et_start, ("et_register: timer has no start function"));
+ et->et_sysctl = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name,
+ CTLFLAG_RW, 0, "event timer description");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "flags", CTLFLAG_RD, &(et->et_flags), 0,
+ "Event timer capabilities");
+ SYSCTL_ADD_UQUAD(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "frequency", CTLFLAG_RD, &(et->et_frequency),
+ "Event timer base frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "quality", CTLFLAG_RD, &(et->et_quality), 0,
+ "Goodness of event timer");
+ ET_LOCK();
+ if (SLIST_EMPTY(&eventtimers) ||
+ SLIST_FIRST(&eventtimers)->et_quality < et->et_quality) {
+ SLIST_INSERT_HEAD(&eventtimers, et, et_all);
+ } else {
+ SLIST_FOREACH(tmp, &eventtimers, et_all) {
+ next = SLIST_NEXT(tmp, et_all);
+ if (next == NULL || next->et_quality < et->et_quality) {
+ SLIST_INSERT_AFTER(tmp, et, et_all);
+ break;
+ }
+ }
+ }
+ ET_UNLOCK();
+ return (0);
+}
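+
+/*
+ * Illustrative registration from a timer driver ("sc" and mytimer_start()
+ * are hypothetical; the fields shown are the ones consumed above):
+ *
+ *	sc->et.et_name = "MYTIMER";
+ *	sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERIODIC;
+ *	sc->et.et_quality = 400;
+ *	sc->et.et_frequency = 1000000;
+ *	sc->et.et_start = mytimer_start;
+ *	et_register(&sc->et);
+ */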
+
+/*
+ * Deregister event timer hardware.
+ */
+int
+et_deregister(struct eventtimer *et)
+{
+ int err = 0;
+
+ if (et->et_deregister_cb != NULL) {
+ if ((err = et->et_deregister_cb(et, et->et_arg)) != 0)
+ return (err);
+ }
+
+ ET_LOCK();
+ SLIST_REMOVE(&eventtimers, et, eventtimer, et_all);
+ ET_UNLOCK();
+ sysctl_remove_oid(et->et_sysctl, 1, 1);
+ return (0);
+}
+
+/*
+ * Find free event timer hardware with specified parameters.
+ */
+struct eventtimer *
+et_find(const char *name, int check, int want)
+{
+ struct eventtimer *et = NULL;
+
+ SLIST_FOREACH(et, &eventtimers, et_all) {
+ if (et->et_active)
+ continue;
+ if (name != NULL && strcasecmp(et->et_name, name) != 0)
+ continue;
+ if (name == NULL && et->et_quality < 0)
+ continue;
+ if ((et->et_flags & check) != want)
+ continue;
+ break;
+ }
+ return (et);
+}
+
+/*
+ * Initialize event timer hardware. Set callbacks.
+ */
+int
+et_init(struct eventtimer *et, et_event_cb_t *event,
+ et_deregister_cb_t *deregister, void *arg)
+{
+
+ if (event == NULL)
+ return (EINVAL);
+ if (et->et_active)
+ return (EBUSY);
+
+ et->et_active = 1;
+ et->et_event_cb = event;
+ et->et_deregister_cb = deregister;
+ et->et_arg = arg;
+ return (0);
+}
+
+/*
+ * Start event timer hardware.
+ * first - delay before first tick.
+ * period - period of subsequent periodic ticks.
+ */
+int
+et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+ KASSERT(period >= 0, ("et_start: negative period"));
+ KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0,
+ ("et_start: period specified for oneshot-only timer"));
+ KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0,
+ ("et_start: period not specified for periodic-only timer"));
+ if (period != 0) {
+ if (period < et->et_min_period)
+ period = et->et_min_period;
+ else if (period > et->et_max_period)
+ period = et->et_max_period;
+ }
+ if (period == 0 || first != 0) {
+ if (first < et->et_min_period)
+ first = et->et_min_period;
+ else if (first > et->et_max_period)
+ first = et->et_max_period;
+ }
+ return (et->et_start(et, first, period));
+}
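+
+/*
+ * Illustrative calls (SBT_1MS/SBT_1S are the sbintime_t constants from
+ * sys/time.h and hz is the system tick rate):
+ *
+ *	et_start(et, SBT_1MS, 0);		one-shot event in ~1 ms
+ *	et_start(et, SBT_1S / hz, SBT_1S / hz);	periodic, one event per tick
+ */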
+
+/* Stop event timer hardware. */
+int
+et_stop(struct eventtimer *et)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+ if (et->et_stop)
+ return (et->et_stop(et));
+ return (0);
+}
+
+/* Mark event timer hardware as broken. */
+int
+et_ban(struct eventtimer *et)
+{
+
+ et->et_flags &= ~(ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT);
+ return (0);
+}
+
+/* Free event timer hardware. */
+int
+et_free(struct eventtimer *et)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+
+ et->et_active = 0;
+ return (0);
+}
+
+/* Report the list of supported event timer hardware via sysctl. */
+static int
+sysctl_kern_eventtimer_choice(SYSCTL_HANDLER_ARGS)
+{
+ char buf[512], *spc;
+ struct eventtimer *et;
+ int error, off;
+
+ spc = "";
+ error = 0;
+ buf[0] = 0;
+ off = 0;
+ ET_LOCK();
+ SLIST_FOREACH(et, &eventtimers, et_all) {
+ off += snprintf(buf + off, sizeof(buf) - off, "%s%s(%d)",
+ spc, et->et_name, et->et_quality);
+ spc = " ";
+ }
+ ET_UNLOCK();
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, choice,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_choice, "A", "Present event timers");
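+
+/*
+ * The string produced above is a space-separated list of "name(quality)"
+ * entries, e.g. "LAPIC(600) HPET(450) i8254(100)" (names and qualities
+ * shown here are only illustrative), and is readable from userland via
+ * sysctl kern.eventtimer.choice.
+ */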
+
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
new file mode 100644
index 0000000..8bde25a
--- /dev/null
+++ b/sys/kern/kern_event.c
@@ -0,0 +1,2261 @@
+/*-
+ * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
+ * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
+ * Copyright (c) 2009 Apple, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/kthread.h>
+#include <sys/selinfo.h>
+#include <sys/stdatomic.h>
+#include <sys/queue.h>
+#include <sys/event.h>
+#include <sys/eventvar.h>
+#include <sys/poll.h>
+#include <sys/protosw.h>
+#include <sys/sigio.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/taskqueue.h>
+#include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/uma.h>
+
+static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
+
+/*
+ * This lock is used if multiple kq locks are required. This possibly
+ * should be made into a per proc lock.
+ */
+static struct mtx kq_global;
+MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
+#define KQ_GLOBAL_LOCK(lck, haslck) do { \
+ if (!haslck) \
+ mtx_lock(lck); \
+ haslck = 1; \
+} while (0)
+#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
+ if (haslck) \
+ mtx_unlock(lck); \
+ haslck = 0; \
+} while (0)
+
+TASKQUEUE_DEFINE_THREAD(kqueue);
+
+static int kevent_copyout(void *arg, struct kevent *kevp, int count);
+static int kevent_copyin(void *arg, struct kevent *kevp, int count);
+static int kqueue_register(struct kqueue *kq, struct kevent *kev,
+ struct thread *td, int waitok);
+static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
+static void kqueue_release(struct kqueue *kq, int locked);
+static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
+ uintptr_t ident, int waitok);
+static void kqueue_task(void *arg, int pending);
+static int kqueue_scan(struct kqueue *kq, int maxevents,
+ struct kevent_copyops *k_ops,
+ const struct timespec *timeout,
+ struct kevent *keva, struct thread *td);
+static void kqueue_wakeup(struct kqueue *kq);
+static struct filterops *kqueue_fo_find(int filt);
+static void kqueue_fo_release(int filt);
+
+static fo_rdwr_t kqueue_read;
+static fo_rdwr_t kqueue_write;
+static fo_truncate_t kqueue_truncate;
+static fo_ioctl_t kqueue_ioctl;
+static fo_poll_t kqueue_poll;
+static fo_kqfilter_t kqueue_kqfilter;
+static fo_stat_t kqueue_stat;
+static fo_close_t kqueue_close;
+
+static struct fileops kqueueops = {
+ .fo_read = kqueue_read,
+ .fo_write = kqueue_write,
+ .fo_truncate = kqueue_truncate,
+ .fo_ioctl = kqueue_ioctl,
+ .fo_poll = kqueue_poll,
+ .fo_kqfilter = kqueue_kqfilter,
+ .fo_stat = kqueue_stat,
+ .fo_close = kqueue_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+};
+
+static int knote_attach(struct knote *kn, struct kqueue *kq);
+static void knote_drop(struct knote *kn, struct thread *td);
+static void knote_enqueue(struct knote *kn);
+static void knote_dequeue(struct knote *kn);
+static void knote_init(void);
+static struct knote *knote_alloc(int waitok);
+static void knote_free(struct knote *kn);
+
+static void filt_kqdetach(struct knote *kn);
+static int filt_kqueue(struct knote *kn, long hint);
+static int filt_procattach(struct knote *kn);
+static void filt_procdetach(struct knote *kn);
+static int filt_proc(struct knote *kn, long hint);
+static int filt_fileattach(struct knote *kn);
+static void filt_timerexpire(void *knx);
+static int filt_timerattach(struct knote *kn);
+static void filt_timerdetach(struct knote *kn);
+static int filt_timer(struct knote *kn, long hint);
+static int filt_userattach(struct knote *kn);
+static void filt_userdetach(struct knote *kn);
+static int filt_user(struct knote *kn, long hint);
+static void filt_usertouch(struct knote *kn, struct kevent *kev,
+ u_long type);
+
+static struct filterops file_filtops = {
+ .f_isfd = 1,
+ .f_attach = filt_fileattach,
+};
+static struct filterops kqread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_kqdetach,
+ .f_event = filt_kqueue,
+};
+/* XXX - move to kern_proc.c? */
+static struct filterops proc_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_procattach,
+ .f_detach = filt_procdetach,
+ .f_event = filt_proc,
+};
+static struct filterops timer_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_timerattach,
+ .f_detach = filt_timerdetach,
+ .f_event = filt_timer,
+};
+static struct filterops user_filtops = {
+ .f_attach = filt_userattach,
+ .f_detach = filt_userdetach,
+ .f_event = filt_user,
+ .f_touch = filt_usertouch,
+};
+
+static uma_zone_t knote_zone;
+static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
+static unsigned int kq_calloutmax = 4 * 1024;
+SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+ &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+
+/* XXX - ensure not KN_INFLUX?? */
+#define KNOTE_ACTIVATE(kn, islock) do { \
+ if ((islock)) \
+ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
+ else \
+ KQ_LOCK((kn)->kn_kq); \
+ (kn)->kn_status |= KN_ACTIVE; \
+ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
+ knote_enqueue((kn)); \
+ if (!(islock)) \
+ KQ_UNLOCK((kn)->kn_kq); \
+} while(0)
+#define KQ_LOCK(kq) do { \
+ mtx_lock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_FLUX_WAKEUP(kq) do { \
+ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
+ (kq)->kq_state &= ~KQ_FLUXWAIT; \
+ wakeup((kq)); \
+ } \
+} while (0)
+#define KQ_UNLOCK_FLUX(kq) do { \
+ KQ_FLUX_WAKEUP(kq); \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_UNLOCK(kq) do { \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_OWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_OWNED); \
+} while (0)
+#define KQ_NOTOWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
+} while (0)
+#define KN_LIST_LOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
+} while (0)
+#define KN_LIST_UNLOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
+} while (0)
+#define KNL_ASSERT_LOCK(knl, islocked) do { \
+ if (islocked) \
+ KNL_ASSERT_LOCKED(knl); \
+ else \
+ KNL_ASSERT_UNLOCKED(knl); \
+} while (0)
+#ifdef INVARIANTS
+#define KNL_ASSERT_LOCKED(knl) do { \
+ knl->kl_assert_locked((knl)->kl_lockarg); \
+} while (0)
+#define KNL_ASSERT_UNLOCKED(knl) do { \
+ knl->kl_assert_unlocked((knl)->kl_lockarg); \
+} while (0)
+#else /* !INVARIANTS */
+#define KNL_ASSERT_LOCKED(knl) do {} while(0)
+#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
+#endif /* INVARIANTS */
+
+#define KN_HASHSIZE 64 /* XXX should be tunable */
+#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
+
+static int
+filt_nullattach(struct knote *kn)
+{
+
+ return (ENXIO);
+};
+
+struct filterops null_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_nullattach,
+};
+
+/* XXX - make SYSINIT to add these, and move into respective modules. */
+extern struct filterops sig_filtops;
+extern struct filterops fs_filtops;
+
+/*
+ * Table for all system-defined filters.
+ */
+static struct mtx filterops_lock;
+MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
+ MTX_DEF);
+static struct {
+ struct filterops *for_fop;
+ int for_refcnt;
+} sysfilt_ops[EVFILT_SYSCOUNT] = {
+ { &file_filtops }, /* EVFILT_READ */
+ { &file_filtops }, /* EVFILT_WRITE */
+ { &null_filtops }, /* EVFILT_AIO */
+ { &file_filtops }, /* EVFILT_VNODE */
+ { &proc_filtops }, /* EVFILT_PROC */
+ { &sig_filtops }, /* EVFILT_SIGNAL */
+ { &timer_filtops }, /* EVFILT_TIMER */
+ { &null_filtops }, /* former EVFILT_NETDEV */
+ { &fs_filtops }, /* EVFILT_FS */
+ { &null_filtops }, /* EVFILT_LIO */
+ { &user_filtops }, /* EVFILT_USER */
+};
+
+/*
+ * Simple redirection for all cdevsw style objects to call their fo_kqfilter
+ * method.
+ */
+static int
+filt_fileattach(struct knote *kn)
+{
+
+ return (fo_kqfilter(kn->kn_fp, kn));
+}
+
+/*ARGSUSED*/
+static int
+kqueue_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+
+ kn->kn_status |= KN_KQUEUE;
+ kn->kn_fop = &kqread_filtops;
+ knlist_add(&kq->kq_sel.si_note, kn, 0);
+
+ return (0);
+}
+
+static void
+filt_kqdetach(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ knlist_remove(&kq->kq_sel.si_note, kn, 0);
+}
+
+/*ARGSUSED*/
+static int
+filt_kqueue(struct knote *kn, long hint)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ kn->kn_data = kq->kq_count;
+ return (kn->kn_data > 0);
+}
+
+/* XXX - move to kern_proc.c? */
+static int
+filt_procattach(struct knote *kn)
+{
+ struct proc *p;
+ int immediate;
+ int error;
+
+ immediate = 0;
+ p = pfind(kn->kn_id);
+ if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
+ p = zpfind(kn->kn_id);
+ immediate = 1;
+ } else if (p != NULL && (p->p_flag & P_WEXIT)) {
+ immediate = 1;
+ }
+
+ if (p == NULL)
+ return (ESRCH);
+ if ((error = p_cansee(curthread, p))) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ /*
+ * internal flag indicating registration done by kernel
+ */
+ if (kn->kn_flags & EV_FLAG1) {
+ kn->kn_data = kn->kn_sdata; /* ppid */
+ kn->kn_fflags = NOTE_CHILD;
+ kn->kn_flags &= ~EV_FLAG1;
+ }
+
+ if (immediate == 0)
+ knlist_add(&p->p_klist, kn, 1);
+
+ /*
+ * Immediately activate any exit notes if the target process is a
+ * zombie. This is necessary to handle the case where the target
+ * process, e.g. a child, dies before the kevent is registered.
+ */
+ if (immediate && filt_proc(kn, NOTE_EXIT))
+ KNOTE_ACTIVATE(kn, 0);
+
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+/*
+ * The knote may be attached to a different process, which may exit,
+ * leaving nothing for the knote to be attached to. So when the process
+ * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
+ * it will be deleted when read out. However, as part of the knote deletion,
+ * this routine is called, so a check is needed to avoid actually performing
+ * a detach, because the original process does not exist any more.
+ */
+/* XXX - move to kern_proc.c? */
+static void
+filt_procdetach(struct knote *kn)
+{
+ struct proc *p;
+
+ p = kn->kn_ptr.p_proc;
+ knlist_remove(&p->p_klist, kn, 0);
+ kn->kn_ptr.p_proc = NULL;
+}
+
+/* XXX - move to kern_proc.c? */
+static int
+filt_proc(struct knote *kn, long hint)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+ u_int event;
+
+ /*
+ * mask off extra data
+ */
+ event = (u_int)hint & NOTE_PCTRLMASK;
+
+ /*
+ * if the user is interested in this event, record it.
+ */
+ if (kn->kn_sfflags & event)
+ kn->kn_fflags |= event;
+
+ /*
+ * process is gone, so flag the event as finished.
+ */
+ if (event == NOTE_EXIT) {
+ if (!(kn->kn_status & KN_DETACHED))
+ knlist_remove_inevent(&p->p_klist, kn);
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ kn->kn_ptr.p_proc = NULL;
+ if (kn->kn_fflags & NOTE_EXIT)
+ kn->kn_data = p->p_xstat;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
+/*
+ * Called when the process forks.  It mostly does the same as knote(),
+ * activating all knotes registered to be activated when the process
+ * forks.  Additionally, for each knote attached to the parent, check
+ * whether the user wants to track the new process.  If so, attach a
+ * new knote to it and immediately report an event with the child's
+ * pid.
+ */
+void
+knote_fork(struct knlist *list, int pid)
+{
+ struct kqueue *kq;
+ struct knote *kn;
+ struct kevent kev;
+ int error;
+
+ if (list == NULL)
+ return;
+ list->kl_lock(list->kl_lockarg);
+
+ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
+ continue;
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_UNLOCK(kq);
+ continue;
+ }
+
+ /*
+ * The same as knote(), activate the event.
+ */
+ if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ KQ_UNLOCK(kq);
+ continue;
+ }
+
+ /*
+ * The NOTE_TRACK case. In addition to the activation
+ * of the event, we need to register new event to
+ * track the child. Drop the locks in preparation for
+ * the call to kqueue_register().
+ */
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ list->kl_unlock(list->kl_lockarg);
+
+ /*
+ * Activate existing knote and register a knote with
+ * new process.
+ */
+ kev.ident = pid;
+ kev.filter = kn->kn_filter;
+ kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.fflags = kn->kn_sfflags;
+ kev.data = kn->kn_id; /* parent */
+ kev.udata = kn->kn_kevent.udata;/* preserve udata */
+ error = kqueue_register(kq, &kev, NULL, 0);
+ if (error)
+ kn->kn_fflags |= NOTE_TRACKERR;
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
+ KNOTE_ACTIVATE(kn, 0);
+ KQ_LOCK(kq);
+ kn->kn_status &= ~KN_INFLUX;
+ KQ_UNLOCK_FLUX(kq);
+ list->kl_lock(list->kl_lockarg);
+ }
+ list->kl_unlock(list->kl_lockarg);
+}
+
+/*
+ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
+ * interval timer support code.
+ */
+static __inline sbintime_t
+timer2sbintime(intptr_t data)
+{
+
+ return (SBT_1MS * data);
+}
+
+static void
+filt_timerexpire(void *knx)
+{
+ struct callout *calloutp;
+ struct knote *kn;
+
+ kn = knx;
+ kn->kn_data++;
+ KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
+
+ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_reset_sbt_on(calloutp,
+ timer2sbintime(kn->kn_sdata), 0 /* 1ms? */,
+ filt_timerexpire, kn, PCPU_GET(cpuid), 0);
+ }
+}
+
+/*
+ * data contains amount of time to sleep, in milliseconds
+ */
+static int
+filt_timerattach(struct knote *kn)
+{
+ struct callout *calloutp;
+ unsigned int ncallouts;
+
+ ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
+ do {
+ if (ncallouts >= kq_calloutmax)
+ return (ENOMEM);
+ } while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
+ &ncallouts, ncallouts + 1, memory_order_relaxed,
+ memory_order_relaxed));
+
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
+ calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
+ callout_init(calloutp, CALLOUT_MPSAFE);
+ kn->kn_hook = calloutp;
+ callout_reset_sbt_on(calloutp,
+ timer2sbintime(kn->kn_sdata), 0 /* 1ms? */,
+ filt_timerexpire, kn, PCPU_GET(cpuid), 0);
+
+ return (0);
+}
+
+static void
+filt_timerdetach(struct knote *kn)
+{
+ struct callout *calloutp;
+ unsigned int old;
+
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_drain(calloutp);
+ free(calloutp, M_KQUEUE);
+ old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
+ KASSERT(old > 0, ("Number of callouts cannot become negative"));
+ kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
+}
+
+static int
+filt_timer(struct knote *kn, long hint)
+{
+
+ return (kn->kn_data != 0);
+}
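+
+/*
+ * Userland usage sketch for the timer filter above (illustrative only;
+ * the identifier 1 and the 500 ms interval are arbitrary).  The data
+ * field is interpreted in milliseconds, as described above:
+ *
+ *	struct kevent kev;
+ *
+ *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ */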
+
+static int
+filt_userattach(struct knote *kn)
+{
+
+ /*
+ * EVFILT_USER knotes are not attached to anything in the kernel.
+ */
+ kn->kn_hook = NULL;
+ if (kn->kn_fflags & NOTE_TRIGGER)
+ kn->kn_hookid = 1;
+ else
+ kn->kn_hookid = 0;
+ return (0);
+}
+
+static void
+filt_userdetach(__unused struct knote *kn)
+{
+
+ /*
+ * EVFILT_USER knotes are not attached to anything in the kernel.
+ */
+}
+
+static int
+filt_user(struct knote *kn, __unused long hint)
+{
+
+ return (kn->kn_hookid);
+}
+
+static void
+filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
+{
+ u_int ffctrl;
+
+ switch (type) {
+ case EVENT_REGISTER:
+ if (kev->fflags & NOTE_TRIGGER)
+ kn->kn_hookid = 1;
+
+ ffctrl = kev->fflags & NOTE_FFCTRLMASK;
+ kev->fflags &= NOTE_FFLAGSMASK;
+ switch (ffctrl) {
+ case NOTE_FFNOP:
+ break;
+
+ case NOTE_FFAND:
+ kn->kn_sfflags &= kev->fflags;
+ break;
+
+ case NOTE_FFOR:
+ kn->kn_sfflags |= kev->fflags;
+ break;
+
+ case NOTE_FFCOPY:
+ kn->kn_sfflags = kev->fflags;
+ break;
+
+ default:
+ /* XXX Return error? */
+ break;
+ }
+ kn->kn_sdata = kev->data;
+ if (kev->flags & EV_CLEAR) {
+ kn->kn_hookid = 0;
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ break;
+
+ case EVENT_PROCESS:
+ *kev = kn->kn_kevent;
+ kev->fflags = kn->kn_sfflags;
+ kev->data = kn->kn_sdata;
+ if (kn->kn_flags & EV_CLEAR) {
+ kn->kn_hookid = 0;
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ break;
+
+ default:
+ panic("filt_usertouch() - invalid type (%ld)", type);
+ break;
+ }
+}
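+
+/*
+ * Userland usage sketch for EVFILT_USER (illustrative; the identifier 1
+ * is arbitrary).  The event is registered once and later fired by a
+ * second kevent() call carrying NOTE_TRIGGER in fflags:
+ *
+ *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ *	...
+ *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ */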
+
+int
+sys_kqueue(struct thread *td, struct kqueue_args *uap)
+{
+ struct filedesc *fdp;
+ struct kqueue *kq;
+ struct file *fp;
+ int fd, error;
+
+ fdp = td->td_proc->p_fd;
+ error = falloc(td, &fp, &fd, 0);
+ if (error)
+ goto done2;
+
+ /* An extra reference on `fp' has been held for us by falloc(). */
+ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
+ mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
+ TAILQ_INIT(&kq->kq_head);
+ kq->kq_fdp = fdp;
+ knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
+ TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+
+ FILEDESC_XLOCK(fdp);
+ SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+ FILEDESC_XUNLOCK(fdp);
+
+ finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
+ fdrop(fp, td);
+
+ td->td_retval[0] = fd;
+done2:
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kevent_args {
+ int fd;
+ const struct kevent *changelist;
+ int nchanges;
+ struct kevent *eventlist;
+ int nevents;
+ const struct timespec *timeout;
+};
+#endif
+int
+sys_kevent(struct thread *td, struct kevent_args *uap)
+{
+ struct timespec ts, *tsp;
+ struct kevent_copyops k_ops = { uap,
+ kevent_copyout,
+ kevent_copyin};
+ int error;
+#ifdef KTRACE
+ struct uio ktruio;
+ struct iovec ktriov;
+ struct uio *ktruioin = NULL;
+ struct uio *ktruioout = NULL;
+#endif
+
+ if (uap->timeout != NULL) {
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov.iov_base = uap->changelist;
+ ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+ ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
+ .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
+ .uio_td = td };
+ ktruioin = cloneuio(&ktruio);
+ ktriov.iov_base = uap->eventlist;
+ ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+ ktruioout = cloneuio(&ktruio);
+ }
+#endif
+
+ error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
+ &k_ops, tsp);
+
+#ifdef KTRACE
+ if (ktruioin != NULL) {
+ ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
+ ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_READ, ktruioout, error);
+ }
+#endif
+
+ return (error);
+}
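+
+/*
+ * Minimal userland sketch of the two syscalls above (illustrative; "fd"
+ * stands for any descriptor whose type supports kqueue filters):
+ *
+ *	struct kevent change, event;
+ *	int kq, n;
+ *
+ *	kq = kqueue();
+ *	EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &change, 1, NULL, 0, NULL);
+ *	n = kevent(kq, NULL, 0, &event, 1, NULL);
+ */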
+
+/*
+ * Copy 'count' items into the destination list pointed to by uap->eventlist.
+ */
+static int
+kevent_copyout(void *arg, struct kevent *kevp, int count)
+{
+ struct kevent_args *uap;
+ int error;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct kevent_args *)arg;
+
+ error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
+ if (error == 0)
+ uap->eventlist += count;
+ return (error);
+}
+
+/*
+ * Copy 'count' items from the list pointed to by uap->changelist.
+ */
+static int
+kevent_copyin(void *arg, struct kevent *kevp, int count)
+{
+ struct kevent_args *uap;
+ int error;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct kevent_args *)arg;
+
+ error = copyin(uap->changelist, kevp, count * sizeof *kevp);
+ if (error == 0)
+ uap->changelist += count;
+ return (error);
+}
+
+int
+kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
+ struct kevent keva[KQ_NEVENTS];
+ struct kevent *kevp, *changes;
+ struct kqueue *kq;
+ struct file *fp;
+ cap_rights_t rights;
+ int i, n, nerrors, error;
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
+ if (error != 0)
+ return (error);
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
+ goto done_norel;
+
+ nerrors = 0;
+
+ while (nchanges > 0) {
+ n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
+ error = k_ops->k_copyin(k_ops->arg, keva, n);
+ if (error)
+ goto done;
+ changes = keva;
+ for (i = 0; i < n; i++) {
+ kevp = &changes[i];
+ if (!kevp->filter)
+ continue;
+ kevp->flags &= ~EV_SYSFLAGS;
+ error = kqueue_register(kq, kevp, td, 1);
+ if (error || (kevp->flags & EV_RECEIPT)) {
+ if (nevents != 0) {
+ kevp->flags = EV_ERROR;
+ kevp->data = error;
+ (void) k_ops->k_copyout(k_ops->arg,
+ kevp, 1);
+ nevents--;
+ nerrors++;
+ } else {
+ goto done;
+ }
+ }
+ }
+ nchanges -= n;
+ }
+ if (nerrors) {
+ td->td_retval[0] = nerrors;
+ error = 0;
+ goto done;
+ }
+
+ error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
+done:
+ kqueue_release(kq, 0);
+done_norel:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kqueue_add_filteropts(int filt, struct filterops *filtops)
+{
+ int error;
+
+ error = 0;
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
+ printf(
+"trying to add a filterop that is out of range: %d is beyond %d\n",
+ ~filt, EVFILT_SYSCOUNT);
+ return EINVAL;
+ }
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop != &null_filtops &&
+ sysfilt_ops[~filt].for_fop != NULL)
+ error = EEXIST;
+ else {
+ sysfilt_ops[~filt].for_fop = filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
+
+ return (error);
+}
+
+int
+kqueue_del_filteropts(int filt)
+{
+ int error;
+
+ error = 0;
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return EINVAL;
+
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop == &null_filtops ||
+ sysfilt_ops[~filt].for_fop == NULL)
+ error = EINVAL;
+ else if (sysfilt_ops[~filt].for_refcnt != 0)
+ error = EBUSY;
+ else {
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
+
+ return error;
+}
+
+static struct filterops *
+kqueue_fo_find(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return NULL;
+
+ mtx_lock(&filterops_lock);
+ sysfilt_ops[~filt].for_refcnt++;
+ if (sysfilt_ops[~filt].for_fop == NULL)
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ mtx_unlock(&filterops_lock);
+
+ return sysfilt_ops[~filt].for_fop;
+}
+
+static void
+kqueue_fo_release(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return;
+
+ mtx_lock(&filterops_lock);
+ KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
+ ("filter object refcount not valid on release"));
+ sysfilt_ops[~filt].for_refcnt--;
+ mtx_unlock(&filterops_lock);
+}
+
+/*
+ * A ref to kq (obtained via kqueue_acquire) must be held.  waitok
+ * determines whether memory allocations may sleep; make sure it is 0
+ * if you hold any mutexes.
+ */
+static int
+kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
+{
+ struct filterops *fops;
+ struct file *fp;
+ struct knote *kn, *tkn;
+ cap_rights_t rights;
+ int error, filt, event;
+ int haskqglobal;
+
+ fp = NULL;
+ kn = NULL;
+ error = 0;
+ haskqglobal = 0;
+
+ filt = kev->filter;
+ fops = kqueue_fo_find(filt);
+ if (fops == NULL)
+ return EINVAL;
+
+ tkn = knote_alloc(waitok); /* prevent waiting with locks */
+
+findkn:
+ if (fops->f_isfd) {
+ KASSERT(td != NULL, ("td is NULL"));
+ error = fget(td, kev->ident,
+ cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
+ if (error)
+ goto done;
+
+ if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
+ kev->ident, 0) != 0) {
+ /* try again */
+ fdrop(fp, td);
+ fp = NULL;
+ error = kqueue_expand(kq, fops, kev->ident, waitok);
+ if (error)
+ goto done;
+ goto findkn;
+ }
+
+ if (fp->f_type == DTYPE_KQUEUE) {
+ /*
+			 * if we add some intelligence about what we are doing,
+ * we should be able to support events on ourselves.
+ * We need to know when we are doing this to prevent
+ * getting both the knlist lock and the kq lock since
+ * they are the same thing.
+ */
+ if (fp->f_data == kq) {
+ error = EINVAL;
+ goto done;
+ }
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ }
+
+ KQ_LOCK(kq);
+ if (kev->ident < kq->kq_knlistsize) {
+ SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
+ if (kev->filter == kn->kn_filter)
+ break;
+ }
+ } else {
+ if ((kev->flags & EV_ADD) == EV_ADD)
+ kqueue_expand(kq, fops, kev->ident, waitok);
+
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask != 0) {
+ struct klist *list;
+
+ list = &kq->kq_knhash[
+ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+ SLIST_FOREACH(kn, list, kn_link)
+ if (kev->ident == kn->kn_id &&
+ kev->filter == kn->kn_filter)
+ break;
+ }
+ }
+
+	/* knote is in the process of changing, wait for it to stabilize. */
+ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
+ if (fp != NULL) {
+ fdrop(fp, td);
+ fp = NULL;
+ }
+ goto findkn;
+ }
+
+ /*
+ * kn now contains the matching knote, or NULL if no match
+ */
+ if (kn == NULL) {
+ if (kev->flags & EV_ADD) {
+ kn = tkn;
+ tkn = NULL;
+ if (kn == NULL) {
+ KQ_UNLOCK(kq);
+ error = ENOMEM;
+ goto done;
+ }
+ kn->kn_fp = fp;
+ kn->kn_kq = kq;
+ kn->kn_fop = fops;
+ /*
+ * apply reference counts to knote structure, and
+ * do not release it at the end of this routine.
+ */
+ fops = NULL;
+ fp = NULL;
+
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ kev->fflags = 0;
+ kev->data = 0;
+ kn->kn_kevent = *kev;
+ kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
+ EV_ENABLE | EV_DISABLE);
+ kn->kn_status = KN_INFLUX|KN_DETACHED;
+
+ error = knote_attach(kn, kq);
+ KQ_UNLOCK(kq);
+ if (error != 0) {
+ tkn = kn;
+ goto done;
+ }
+
+ if ((error = kn->kn_fop->f_attach(kn)) != 0) {
+ knote_drop(kn, td);
+ goto done;
+ }
+ KN_LIST_LOCK(kn);
+ goto done_ev_add;
+ } else {
+ /* No matching knote and the EV_ADD flag is not set. */
+ KQ_UNLOCK(kq);
+ error = ENOENT;
+ goto done;
+ }
+ }
+
+ if (kev->flags & EV_DELETE) {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ goto done;
+ }
+
+ /*
+ * The user may change some filter values after the initial EV_ADD,
+ * but doing so will not reset any filter which has already been
+ * triggered.
+ */
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ KN_LIST_LOCK(kn);
+ kn->kn_kevent.udata = kev->udata;
+ if (!fops->f_isfd && fops->f_touch != NULL) {
+ fops->f_touch(kn, kev, EVENT_REGISTER);
+ } else {
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ }
+
+ /*
+ * We can get here with kn->kn_knlist == NULL. This can happen when
+ * the initial attach event decides that the event is "completed"
+	 * already, e.g. when filt_procattach is called on a zombie process.
+	 * It will call filt_proc, which will remove the knote from the list
+	 * and NULL out kn_knlist.
+ */
+done_ev_add:
+ event = kn->kn_fop->f_event(kn, 0);
+ KQ_LOCK(kq);
+ if (event)
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_INFLUX;
+ KN_LIST_UNLOCK(kn);
+
+ if ((kev->flags & EV_DISABLE) &&
+ ((kn->kn_status & KN_DISABLED) == 0)) {
+ kn->kn_status |= KN_DISABLED;
+ }
+
+ if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
+ kn->kn_status &= ~KN_DISABLED;
+ if ((kn->kn_status & KN_ACTIVE) &&
+ ((kn->kn_status & KN_QUEUED) == 0))
+ knote_enqueue(kn);
+ }
+ KQ_UNLOCK_FLUX(kq);
+
+done:
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (fp != NULL)
+ fdrop(fp, td);
+ if (tkn != NULL)
+ knote_free(tkn);
+ if (fops != NULL)
+ kqueue_fo_release(filt);
+ return (error);
+}
+
+static int
+kqueue_acquire(struct file *fp, struct kqueue **kqp)
+{
+ int error;
+ struct kqueue *kq;
+
+ error = 0;
+
+ kq = fp->f_data;
+ if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
+ return (EBADF);
+ *kqp = kq;
+ KQ_LOCK(kq);
+ if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
+ KQ_UNLOCK(kq);
+ return (EBADF);
+ }
+ kq->kq_refcnt++;
+ KQ_UNLOCK(kq);
+
+ return error;
+}
+
+static void
+kqueue_release(struct kqueue *kq, int locked)
+{
+ if (locked)
+ KQ_OWNED(kq);
+ else
+ KQ_LOCK(kq);
+ kq->kq_refcnt--;
+ if (kq->kq_refcnt == 1)
+ wakeup(&kq->kq_refcnt);
+ if (!locked)
+ KQ_UNLOCK(kq);
+}
+
+static void
+kqueue_schedtask(struct kqueue *kq)
+{
+
+ KQ_OWNED(kq);
+ KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
+ ("scheduling kqueue task while draining"));
+
+ if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
+ taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
+ kq->kq_state |= KQ_TASKSCHED;
+ }
+}
+
+/*
+ * Expand the kq to make sure we have storage for fops/ident pair.
+ *
+ * Return 0 on success (or no work necessary), return errno on failure.
+ *
+ * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
+ * If kqueue_register is called from a non-fd context, there usually/should
+ * be no locks held.
+ */
+static int
+kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
+ int waitok)
+{
+ struct klist *list, *tmp_knhash, *to_free;
+ u_long tmp_knhashmask;
+ int size;
+ int fd;
+ int mflag = waitok ? M_WAITOK : M_NOWAIT;
+
+ KQ_NOTOWNED(kq);
+
+ to_free = NULL;
+ if (fops->f_isfd) {
+ fd = ident;
+ if (kq->kq_knlistsize <= fd) {
+ size = kq->kq_knlistsize;
+ while (size <= fd)
+ size += KQEXTENT;
+ list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
+ if (list == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knlistsize > fd) {
+ to_free = list;
+ list = NULL;
+ } else {
+ if (kq->kq_knlist != NULL) {
+ bcopy(kq->kq_knlist, list,
+ kq->kq_knlistsize * sizeof(*list));
+ to_free = kq->kq_knlist;
+ kq->kq_knlist = NULL;
+ }
+ bzero((caddr_t)list +
+ kq->kq_knlistsize * sizeof(*list),
+ (size - kq->kq_knlistsize) * sizeof(*list));
+ kq->kq_knlistsize = size;
+ kq->kq_knlist = list;
+ }
+ KQ_UNLOCK(kq);
+ }
+ } else {
+ if (kq->kq_knhashmask == 0) {
+ tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
+ &tmp_knhashmask);
+ if (tmp_knhash == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask == 0) {
+ kq->kq_knhash = tmp_knhash;
+ kq->kq_knhashmask = tmp_knhashmask;
+ } else {
+ to_free = tmp_knhash;
+ }
+ KQ_UNLOCK(kq);
+ }
+ }
+ free(to_free, M_KQUEUE);
+
+ KQ_NOTOWNED(kq);
+ return 0;
+}
+
+static void
+kqueue_task(void *arg, int pending)
+{
+ struct kqueue *kq;
+ int haskqglobal;
+
+ haskqglobal = 0;
+ kq = arg;
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KQ_LOCK(kq);
+
+ KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
+
+ kq->kq_state &= ~KQ_TASKSCHED;
+ if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
+ wakeup(&kq->kq_state);
+ }
+ KQ_UNLOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+}
+
+/*
+ * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
+ * We treat KN_MARKER knotes as if they are INFLUX.
+ */
+static int
+kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
+ const struct timespec *tsp, struct kevent *keva, struct thread *td)
+{
+ struct kevent *kevp;
+ struct knote *kn, *marker;
+ sbintime_t asbt, rsbt;
+ int count, error, haskqglobal, influx, nkev, touch;
+
+ count = maxevents;
+ nkev = 0;
+ error = 0;
+ haskqglobal = 0;
+
+ if (maxevents == 0)
+ goto done_nl;
+
+ rsbt = 0;
+ if (tsp != NULL) {
+ if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
+ tsp->tv_nsec >= 1000000000) {
+ error = EINVAL;
+ goto done_nl;
+ }
+ if (timespecisset(tsp)) {
+ if (tsp->tv_sec <= INT32_MAX) {
+ rsbt = tstosbt(*tsp);
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = 0;
+ rsbt >>= tc_precexp;
+ } else
+ asbt = 0;
+ } else
+ asbt = -1;
+ } else
+ asbt = 0;
+ marker = knote_alloc(1);
+ if (marker == NULL) {
+ error = ENOMEM;
+ goto done_nl;
+ }
+ marker->kn_status = KN_MARKER;
+ KQ_LOCK(kq);
+
+retry:
+ kevp = keva;
+ if (kq->kq_count == 0) {
+ if (asbt == -1) {
+ error = EWOULDBLOCK;
+ } else {
+ kq->kq_state |= KQ_SLEEP;
+ error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
+ "kqread", asbt, rsbt, C_ABSOLUTE);
+ }
+ if (error == 0)
+ goto retry;
+ /* don't restart after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ else if (error == EWOULDBLOCK)
+ error = 0;
+ goto done;
+ }
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
+ influx = 0;
+ while (count) {
+ KQ_OWNED(kq);
+ kn = TAILQ_FIRST(&kq->kq_head);
+
+ if ((kn->kn_status == KN_MARKER && kn != marker) ||
+ (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ if (influx) {
+ influx = 0;
+ KQ_FLUX_WAKEUP(kq);
+ }
+ kq->kq_state |= KQ_FLUXWAIT;
+ error = msleep(kq, &kq->kq_lock, PSOCK,
+ "kqflxwt", 0);
+ continue;
+ }
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+ continue;
+ }
+ if (kn == marker) {
+ KQ_FLUX_WAKEUP(kq);
+ if (count == maxevents)
+ goto retry;
+ goto done;
+ }
+ KASSERT((kn->kn_status & KN_INFLUX) == 0,
+		    ("KN_INFLUX set when not supposed to be"));
+
+ if ((kn->kn_flags & EV_DROP) == EV_DROP) {
+ kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
+ kq->kq_count--;
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ continue;
+ } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+ kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
+ kq->kq_count--;
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
+ *kevp = kn->kn_kevent;
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ kn = NULL;
+ } else {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KN_LIST_LOCK(kn);
+ if (kn->kn_fop->f_event(kn, 0) == 0) {
+ KQ_LOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ kn->kn_status &=
+ ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
+ kq->kq_count--;
+ KN_LIST_UNLOCK(kn);
+ influx = 1;
+ continue;
+ }
+ touch = (!kn->kn_fop->f_isfd &&
+ kn->kn_fop->f_touch != NULL);
+ if (touch)
+ kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
+ else
+ *kevp = kn->kn_kevent;
+ KQ_LOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
+ /*
+				 * Manually clear knotes that weren't
+ * 'touch'ed.
+ */
+ if (touch == 0 && kn->kn_flags & EV_CLEAR) {
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ if (kn->kn_flags & EV_DISPATCH)
+ kn->kn_status |= KN_DISABLED;
+ kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ kq->kq_count--;
+ } else
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+
+ kn->kn_status &= ~(KN_INFLUX);
+ KN_LIST_UNLOCK(kn);
+ influx = 1;
+ }
+
+ /* we are returning a copy to the user */
+ kevp++;
+ nkev++;
+ count--;
+
+ if (nkev == KQ_NEVENTS) {
+ influx = 0;
+ KQ_UNLOCK_FLUX(kq);
+ error = k_ops->k_copyout(k_ops->arg, keva, nkev);
+ nkev = 0;
+ kevp = keva;
+ KQ_LOCK(kq);
+ if (error)
+ break;
+ }
+ }
+ TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
+done:
+ KQ_OWNED(kq);
+ KQ_UNLOCK_FLUX(kq);
+ knote_free(marker);
+done_nl:
+ KQ_NOTOWNED(kq);
+ if (nkev != 0)
+ error = k_ops->k_copyout(k_ops->arg, keva, nkev);
+ td->td_retval[0] = maxevents - count;
+ return (error);
+}
+
+/*
+ * XXX
+ * This could be expanded to call kqueue_scan, if desired.
+ */
+/*ARGSUSED*/
+static int
+kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ /*
+ * Enabling sigio causes two major problems:
+ * 1) infinite recursion:
+	 * Synopsis: kevent is being used to track signals and have FIOASYNC
+ * set. On receipt of a signal this will cause a kqueue to recurse
+ * into itself over and over. Sending the sigio causes the kqueue
+ * to become ready, which in turn posts sigio again, forever.
+ * Solution: this can be solved by setting a flag in the kqueue that
+ * we have a SIGIO in progress.
+ * 2) locking problems:
+	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
+ * us above the proc and pgrp locks.
+ * Solution: Post a signal using an async mechanism, being sure to
+ * record a generation count in the delivery so that we do not deliver
+ * a signal to the wrong process.
+ *
+ * Note, these two mechanisms are somewhat mutually exclusive!
+ */
+#if 0
+ struct kqueue *kq;
+
+ kq = fp->f_data;
+ switch (cmd) {
+ case FIOASYNC:
+ if (*(int *)data) {
+ kq->kq_state |= KQ_ASYNC;
+ } else {
+ kq->kq_state &= ~KQ_ASYNC;
+ }
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &kq->kq_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&kq->kq_sigio);
+ return (0);
+ }
+#endif
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct kqueue *kq;
+ int revents = 0;
+ int error;
+
+ if ((error = kqueue_acquire(fp, &kq)))
+ return POLLERR;
+
+ KQ_LOCK(kq);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (kq->kq_count) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ selrecord(td, &kq->kq_sel);
+ if (SEL_WAITING(&kq->kq_sel))
+ kq->kq_state |= KQ_SEL;
+ }
+ }
+ kqueue_release(kq, 1);
+ KQ_UNLOCK(kq);
+ return (revents);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ bzero((void *)st, sizeof *st);
+ /*
+ * We no longer return kq_count because the unlocked value is useless.
+ * If you spent all this time getting the count, why not spend your
+ * syscall better by calling kevent?
+ *
+ * XXX - This is needed for libc_r.
+ */
+ st->st_mode = S_IFIFO;
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_close(struct file *fp, struct thread *td)
+{
+ struct kqueue *kq = fp->f_data;
+ struct filedesc *fdp;
+ struct knote *kn;
+ int i;
+ int error;
+
+ if ((error = kqueue_acquire(fp, &kq)))
+ return error;
+
+ KQ_LOCK(kq);
+
+ KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
+ ("kqueue already closing"));
+ kq->kq_state |= KQ_CLOSING;
+ if (kq->kq_refcnt > 1)
+ msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
+
+ KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
+ fdp = kq->kq_fdp;
+
+ KASSERT(knlist_empty(&kq->kq_sel.si_note),
+ ("kqueue's knlist not empty"));
+
+ for (i = 0; i < kq->kq_knlistsize; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
+ continue;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ }
+ }
+ if (kq->kq_knhashmask != 0) {
+ for (i = 0; i <= kq->kq_knhashmask; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK,
+ "kqclo2", 0);
+ continue;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ }
+ }
+ }
+
+ if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
+ kq->kq_state |= KQ_TASKDRAIN;
+ msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
+ }
+
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
+ selwakeuppri(&kq->kq_sel, PSOCK);
+ if (!SEL_WAITING(&kq->kq_sel))
+ kq->kq_state &= ~KQ_SEL;
+ }
+
+ KQ_UNLOCK(kq);
+
+ FILEDESC_XLOCK(fdp);
+ SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
+ FILEDESC_XUNLOCK(fdp);
+
+ seldrain(&kq->kq_sel);
+ knlist_destroy(&kq->kq_sel.si_note);
+ mtx_destroy(&kq->kq_lock);
+ kq->kq_fdp = NULL;
+
+ if (kq->kq_knhash != NULL)
+ free(kq->kq_knhash, M_KQUEUE);
+ if (kq->kq_knlist != NULL)
+ free(kq->kq_knlist, M_KQUEUE);
+
+ funsetown(&kq->kq_sigio);
+ free(kq, M_KQUEUE);
+ fp->f_data = NULL;
+
+ return (0);
+}
+
+static void
+kqueue_wakeup(struct kqueue *kq)
+{
+ KQ_OWNED(kq);
+
+ if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
+ kq->kq_state &= ~KQ_SLEEP;
+ wakeup(kq);
+ }
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
+ selwakeuppri(&kq->kq_sel, PSOCK);
+ if (!SEL_WAITING(&kq->kq_sel))
+ kq->kq_state &= ~KQ_SEL;
+ }
+ if (!knlist_empty(&kq->kq_sel.si_note))
+ kqueue_schedtask(kq);
+ if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
+ pgsigio(&kq->kq_sigio, SIGIO, 0);
+ }
+}
+
+/*
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * There is an opportunity to optimize the case of one kq watching another:
+ * instead of scheduling a task to wake it up, enough state could be passed
+ * down the chain to wake up the parent kqueue directly.  Make this code
+ * functional first.
+ */
+void
+knote(struct knlist *list, long hint, int lockflags)
+{
+ struct kqueue *kq;
+ struct knote *kn;
+ int error;
+
+ if (list == NULL)
+ return;
+
+ KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
+
+ if ((lockflags & KNF_LISTLOCKED) == 0)
+ list->kl_lock(list->kl_lockarg);
+
+ /*
+	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
+	 * the kqueue scheduling, but this will introduce four
+	 * lock/unlock pairs for each knote to test.  If we do, continue to
+	 * use SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case,
+	 * as it is only safe when removing the current item, which we are
+	 * not doing.
+ */
+ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ kq = kn->kn_kq;
+ if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_UNLOCK(kq);
+ } else if ((lockflags & KNF_NOKQLOCK) != 0) {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ error = kn->kn_fop->f_event(kn, hint);
+ KQ_LOCK(kq);
+ kn->kn_status &= ~KN_INFLUX;
+ if (error)
+ KNOTE_ACTIVATE(kn, 1);
+ KQ_UNLOCK_FLUX(kq);
+ } else {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, hint))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ KQ_UNLOCK(kq);
+ }
+ }
+ kq = NULL;
+ }
+ if ((lockflags & KNF_LISTLOCKED) == 0)
+ list->kl_unlock(list->kl_lockarg);
+}
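+
+/*
+ * Illustrative driver-side sketch of feeding the function above.  "sc" is
+ * a hypothetical softc carrying its own mutex and a struct selinfo:
+ *
+ *	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
+ *	...
+ *	mtx_lock(&sc->sc_mtx);
+ *	(when data becomes available)
+ *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
+ *	mtx_unlock(&sc->sc_mtx);
+ */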
+
+/*
+ * add a knote to a knlist
+ */
+void
+knlist_add(struct knlist *knl, struct knote *kn, int islocked)
+{
+ KNL_ASSERT_LOCK(knl, islocked);
+ KQ_NOTOWNED(kn->kn_kq);
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
+ (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
+ if (!islocked)
+ knl->kl_lock(knl->kl_lockarg);
+ SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
+ if (!islocked)
+ knl->kl_unlock(knl->kl_lockarg);
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_knlist = knl;
+ kn->kn_status &= ~KN_DETACHED;
+ KQ_UNLOCK(kn->kn_kq);
+}
+
+static void
+knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
+{
+ KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
+ KNL_ASSERT_LOCK(knl, knlislocked);
+ mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
+ if (!kqislocked)
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
+ ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
+ if (!knlislocked)
+ knl->kl_lock(knl->kl_lockarg);
+ SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
+ kn->kn_knlist = NULL;
+ if (!knlislocked)
+ knl->kl_unlock(knl->kl_lockarg);
+ if (!kqislocked)
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_status |= KN_DETACHED;
+ if (!kqislocked)
+ KQ_UNLOCK(kn->kn_kq);
+}
+
+/*
+ * remove knote from the specified knlist
+ */
+void
+knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
+{
+
+ knlist_remove_kq(knl, kn, islocked, 0);
+}
+
+/*
+ * remove knote from the specified knlist while in f_event handler.
+ */
+void
+knlist_remove_inevent(struct knlist *knl, struct knote *kn)
+{
+
+ knlist_remove_kq(knl, kn, 1,
+ (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
+}
+
+int
+knlist_empty(struct knlist *knl)
+{
+
+ KNL_ASSERT_LOCKED(knl);
+ return SLIST_EMPTY(&knl->kl_list);
+}
+
+static struct mtx knlist_lock;
+MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
+ MTX_DEF);
+static void knlist_mtx_lock(void *arg);
+static void knlist_mtx_unlock(void *arg);
+
+static void
+knlist_mtx_lock(void *arg)
+{
+
+ mtx_lock((struct mtx *)arg);
+}
+
+static void
+knlist_mtx_unlock(void *arg)
+{
+
+ mtx_unlock((struct mtx *)arg);
+}
+
+static void
+knlist_mtx_assert_locked(void *arg)
+{
+
+ mtx_assert((struct mtx *)arg, MA_OWNED);
+}
+
+static void
+knlist_mtx_assert_unlocked(void *arg)
+{
+
+ mtx_assert((struct mtx *)arg, MA_NOTOWNED);
+}
+
+static void
+knlist_rw_rlock(void *arg)
+{
+
+ rw_rlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_runlock(void *arg)
+{
+
+ rw_runlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_assert_locked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_LOCKED);
+}
+
+static void
+knlist_rw_assert_unlocked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_UNLOCKED);
+}
+
+void
+knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
+ void (*kl_unlock)(void *),
+ void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
+{
+
+ if (lock == NULL)
+ knl->kl_lockarg = &knlist_lock;
+ else
+ knl->kl_lockarg = lock;
+
+ if (kl_lock == NULL)
+ knl->kl_lock = knlist_mtx_lock;
+ else
+ knl->kl_lock = kl_lock;
+ if (kl_unlock == NULL)
+ knl->kl_unlock = knlist_mtx_unlock;
+ else
+ knl->kl_unlock = kl_unlock;
+ if (kl_assert_locked == NULL)
+ knl->kl_assert_locked = knlist_mtx_assert_locked;
+ else
+ knl->kl_assert_locked = kl_assert_locked;
+ if (kl_assert_unlocked == NULL)
+ knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
+ else
+ knl->kl_assert_unlocked = kl_assert_unlocked;
+
+ SLIST_INIT(&knl->kl_list);
+}
+
+void
+knlist_init_mtx(struct knlist *knl, struct mtx *lock)
+{
+
+ knlist_init(knl, lock, NULL, NULL, NULL, NULL);
+}
+
+void
+knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
+{
+
+ knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
+ knlist_rw_assert_locked, knlist_rw_assert_unlocked);
+}
+
+void
+knlist_destroy(struct knlist *knl)
+{
+
+#ifdef INVARIANTS
+ /*
+ * if we run across this error, we need to find the offending
+ * driver and have it call knlist_clear or knlist_delete.
+ */
+ if (!SLIST_EMPTY(&knl->kl_list))
+ printf("WARNING: destroying knlist w/ knotes on it!\n");
+#endif
+
+ knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
+ SLIST_INIT(&knl->kl_list);
+}
+
+/*
+ * Even if we are locked, we may need to drop the lock to allow any influx
+ * knotes time to "settle".
+ */
+void
+knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
+{
+ struct knote *kn, *kn2;
+ struct kqueue *kq;
+
+ if (islocked)
+ KNL_ASSERT_LOCKED(knl);
+ else {
+ KNL_ASSERT_UNLOCKED(knl);
+again: /* need to reacquire lock since we have dropped it */
+ knl->kl_lock(knl->kl_lockarg);
+ }
+
+ SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX)) {
+ KQ_UNLOCK(kq);
+ continue;
+ }
+ knlist_remove_kq(knl, kn, 1, 1);
+ if (killkn) {
+ kn->kn_status |= KN_INFLUX | KN_DETACHED;
+ KQ_UNLOCK(kq);
+ knote_drop(kn, td);
+ } else {
+ /* Make sure cleared knotes disappear soon */
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ KQ_UNLOCK(kq);
+ }
+ kq = NULL;
+ }
+
+ if (!SLIST_EMPTY(&knl->kl_list)) {
+ /* there are still KN_INFLUX remaining */
+ kn = SLIST_FIRST(&knl->kl_list);
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ KASSERT(kn->kn_status & KN_INFLUX,
+ ("knote removed w/o list lock"));
+ knl->kl_unlock(knl->kl_lockarg);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
+ kq = NULL;
+ goto again;
+ }
+
+ if (islocked)
+ KNL_ASSERT_LOCKED(knl);
+ else {
+ knl->kl_unlock(knl->kl_lockarg);
+ KNL_ASSERT_UNLOCKED(knl);
+ }
+}
+
+/*
+ * Remove all knotes referencing a specified fd; this must be called with
+ * the FILEDESC lock held.  This prevents a race where a new fd comes along,
+ * occupies the entry, and we end up attaching a knote to the wrong fd.
+ */
+void
+knote_fdclose(struct thread *td, int fd)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct kqueue *kq;
+ struct knote *kn;
+ int influx;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ /*
+ * We shouldn't have to worry about new kevents appearing on fd
+ * since filedesc is locked.
+ */
+ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
+ KQ_LOCK(kq);
+
+again:
+ influx = 0;
+ while (kq->kq_knlistsize > fd &&
+ (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
+ if (kn->kn_status & KN_INFLUX) {
+ /* someone else might be waiting on our knote */
+ if (influx)
+ wakeup(kq);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
+ goto again;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ influx = 1;
+ KQ_LOCK(kq);
+ }
+ KQ_UNLOCK_FLUX(kq);
+ }
+}
+
+static int
+knote_attach(struct knote *kn, struct kqueue *kq)
+{
+ struct klist *list;
+
+ KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
+ KQ_OWNED(kq);
+
+ if (kn->kn_fop->f_isfd) {
+ if (kn->kn_id >= kq->kq_knlistsize)
+ return ENOMEM;
+ list = &kq->kq_knlist[kn->kn_id];
+ } else {
+ if (kq->kq_knhash == NULL)
+ return ENOMEM;
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
+ }
+
+ SLIST_INSERT_HEAD(list, kn, kn_link);
+
+ return 0;
+}
+
+/*
+ * The knote must already have been detached using the f_detach method.
+ * No lock needs to be held; it is assumed that the KN_INFLUX flag is set
+ * to prevent other removal.
+ */
+static void
+knote_drop(struct knote *kn, struct thread *td)
+{
+ struct kqueue *kq;
+ struct klist *list;
+
+ kq = kn->kn_kq;
+
+ KQ_NOTOWNED(kq);
+ KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
+ ("knote_drop called without KN_INFLUX set in kn_status"));
+
+ KQ_LOCK(kq);
+ if (kn->kn_fop->f_isfd)
+ list = &kq->kq_knlist[kn->kn_id];
+ else
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
+
+ if (!SLIST_EMPTY(list))
+ SLIST_REMOVE(list, kn, knote, kn_link);
+ if (kn->kn_status & KN_QUEUED)
+ knote_dequeue(kn);
+ KQ_UNLOCK_FLUX(kq);
+
+ if (kn->kn_fop->f_isfd) {
+ fdrop(kn->kn_fp, td);
+ kn->kn_fp = NULL;
+ }
+ kqueue_fo_release(kn->kn_kevent.filter);
+ kn->kn_fop = NULL;
+ knote_free(kn);
+}
+
+static void
+knote_enqueue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+
+ KQ_OWNED(kn->kn_kq);
+ KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status |= KN_QUEUED;
+ kq->kq_count++;
+ kqueue_wakeup(kq);
+}
+
+static void
+knote_dequeue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+
+ KQ_OWNED(kn->kn_kq);
+ KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+}
+
+static void
+knote_init(void)
+{
+
+ knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
+
+static struct knote *
+knote_alloc(int waitok)
+{
+ return ((struct knote *)uma_zalloc(knote_zone,
+ (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
+}
+
+static void
+knote_free(struct knote *kn)
+{
+ if (kn != NULL)
+ uma_zfree(knote_zone, kn);
+}
+
+/*
+ * Register the kev w/ the kq specified by fd.
+ */
+int
+kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
+{
+ struct kqueue *kq;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
+ if (error != 0)
+ return (error);
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
+ goto noacquire;
+
+ error = kqueue_register(kq, kev, td, waitok);
+
+ kqueue_release(kq, 0);
+
+noacquire:
+ fdrop(fp, td);
+
+ return error;
+}
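+
+/*
+ * Illustrative in-kernel caller sketch (hypothetical; "kqfd" and "cookie"
+ * are not defined in this file).  A kernel consumer holding no locks can
+ * post an event to a process's kqueue through the function above:
+ *
+ *	struct kevent kev;
+ *
+ *	EV_SET(&kev, (uintptr_t)cookie, EVFILT_USER, EV_ADD, 0, 0, NULL);
+ *	error = kqfd_register(kqfd, &kev, td, 1);
+ */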
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
new file mode 100644
index 0000000..45f732b
--- /dev/null
+++ b/sys/kern/kern_exec.c
@@ -0,0 +1,1496 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/sf_buf.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <machine/reg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_execexit_func_t dtrace_fasttrap_exec;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE1(proc, kernel, , exec, exec, "char *");
+SDT_PROBE_DEFINE1(proc, kernel, , exec_failure, exec-failure, "int");
+SDT_PROBE_DEFINE1(proc, kernel, , exec_success, exec-success, "char *");
+
+MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+
+static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
+static int do_execve(struct thread *td, struct image_args *args,
+ struct mac *mac_p);
+
+/* XXX This should be vm_size_t. */
+SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_ps_strings, "LU", "");
+
+/* XXX This should be vm_size_t. */
+SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
+ CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
+
+SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_stackprot, "I", "");
+
+u_long ps_arg_cache_limit = PAGE_SIZE / 16;
+SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
+ &ps_arg_cache_limit, 0, "");
+
+static int map_at_zero = 0;
+TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
+SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
+ "Permit processes to map an object at virtual address 0.");
+
+static int
+sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ int error;
+
+ p = curproc;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ unsigned int val;
+ val = (unsigned int)p->p_sysent->sv_psstrings;
+ error = SYSCTL_OUT(req, &val, sizeof(val));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
+ sizeof(p->p_sysent->sv_psstrings));
+	return (error);
+}
+
+static int
+sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ int error;
+
+ p = curproc;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ unsigned int val;
+ val = (unsigned int)p->p_sysent->sv_usrstack;
+ error = SYSCTL_OUT(req, &val, sizeof(val));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
+ sizeof(p->p_sysent->sv_usrstack));
+	return (error);
+}
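+
+/*
+ * For reference, a hypothetical userland sketch that reads the OID exported
+ * above with sysctlbyname(3).  Guarded out; it is not part of the kernel
+ * build.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static unsigned long
+example_read_usrstack(void)
+{
+	unsigned long usrstack;
+	size_t len = sizeof(usrstack);
+
+	if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) != 0)
+		return (0);
+	return (usrstack);
+}
+#endif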
+
+static int
+sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
+ sizeof(p->p_sysent->sv_stackprot)));
+}
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+int
+sys_execve(td, uap)
+ struct thread *td;
+ struct execve_args /* {
+ char *fname;
+ char **argv;
+ char **envv;
+ } */ *uap;
+{
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
+ uap->argv, uap->envv);
+ if (error == 0)
+ error = kern_execve(td, &args, NULL);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fexecve_args {
+ int fd;
+ char **argv;
+ char **envv;
+};
+#endif
+int
+sys_fexecve(struct thread *td, struct fexecve_args *uap)
+{
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
+ uap->argv, uap->envv);
+ if (error == 0) {
+ args.fd = uap->fd;
+ error = kern_execve(td, &args, NULL);
+ }
+ return (error);
+}
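+
+/*
+ * For reference, a hypothetical userland sketch of the descriptor-based
+ * path handled above: the binary is opened with O_EXEC and passed to
+ * fexecve(2).  The path and argument vector are placeholders.  Guarded
+ * out; it is not part of the kernel build.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+static void
+example_fexecve(void)
+{
+	char *argv[] = { "prog", NULL };
+	char *envv[] = { NULL };
+	int fd;
+
+	fd = open("/path/to/prog", O_EXEC);
+	if (fd != -1)
+		(void)fexecve(fd, argv, envv);
+}
+#endif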
+
+#ifndef _SYS_SYSPROTO_H_
+struct __mac_execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+ struct mac *mac_p;
+};
+#endif
+
+int
+sys___mac_execve(td, uap)
+ struct thread *td;
+ struct __mac_execve_args /* {
+ char *fname;
+ char **argv;
+ char **envv;
+ struct mac *mac_p;
+ } */ *uap;
+{
+#ifdef MAC
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
+ uap->argv, uap->envv);
+ if (error == 0)
+ error = kern_execve(td, &args, uap->mac_p);
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+/*
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller. If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
+ */
+int
+kern_execve(td, args, mac_p)
+ struct thread *td;
+ struct image_args *args;
+ struct mac *mac_p;
+{
+ struct proc *p = td->td_proc;
+ int error;
+
+ AUDIT_ARG_ARGV(args->begin_argv, args->argc,
+ args->begin_envv - args->begin_argv);
+ AUDIT_ARG_ENVV(args->begin_envv, args->envc,
+ args->endp - args->begin_envv);
+ if (p->p_flag & P_HADTHREADS) {
+ PROC_LOCK(p);
+ if (thread_single(SINGLE_BOUNDARY)) {
+ PROC_UNLOCK(p);
+ exec_free_args(args);
+ return (ERESTART); /* Try again later. */
+ }
+ PROC_UNLOCK(p);
+ }
+
+ error = do_execve(td, args, mac_p);
+
+ if (p->p_flag & P_HADTHREADS) {
+ PROC_LOCK(p);
+ /*
+		 * If successful, we upgrade to SINGLE_EXIT state to
+		 * force the other threads to exit.
+ */
+ if (error == 0)
+ thread_single(SINGLE_EXIT);
+ else
+ thread_single_end();
+ PROC_UNLOCK(p);
+ }
+
+ return (error);
+}
+
+/*
+ * In-kernel implementation of execve(). All arguments are assumed to be
+ * userspace pointers from the passed thread.
+ */
+static int
+do_execve(td, args, mac_p)
+ struct thread *td;
+ struct image_args *args;
+ struct mac *mac_p;
+{
+ struct proc *p = td->td_proc;
+ struct nameidata nd;
+ struct ucred *newcred = NULL, *oldcred;
+ struct uidinfo *euip;
+ register_t *stack_base;
+ int error, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+ int (*img_first)(struct image_params *);
+ struct pargs *oldargs = NULL, *newargs = NULL;
+ struct sigacts *oldsigacts, *newsigacts;
+#ifdef KTRACE
+ struct vnode *tracevp = NULL;
+ struct ucred *tracecred = NULL;
+#endif
+ struct vnode *textvp = NULL, *binvp = NULL;
+ cap_rights_t rights;
+ int credential_changing;
+ int textset;
+#ifdef MAC
+ struct label *interpvplabel = NULL;
+ int will_transition;
+#endif
+#ifdef HWPMC_HOOKS
+ struct pmckern_procexec pe;
+#endif
+ static const char fexecv_proc_title[] = "(fexecv)";
+
+ imgp = &image_params;
+
+ /*
+ * Lock the process and set the P_INEXEC flag to indicate that
+ * it should be left alone until we're done here. This is
+ * necessary to avoid race conditions - e.g. in ptrace() -
+ * that might allow a local user to illicitly obtain elevated
+ * privileges.
+ */
+ PROC_LOCK(p);
+ KASSERT((p->p_flag & P_INEXEC) == 0,
+ ("%s(): process already has P_INEXEC flag", __func__));
+ p->p_flag |= P_INEXEC;
+ PROC_UNLOCK(p);
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->execlabel = NULL;
+ imgp->attr = &attr;
+ imgp->entry_addr = 0;
+ imgp->reloc_base = 0;
+ imgp->vmspace_destroyed = 0;
+ imgp->interpreted = 0;
+ imgp->opened = 0;
+ imgp->interpreter_name = NULL;
+ imgp->auxargs = NULL;
+ imgp->vp = NULL;
+ imgp->object = NULL;
+ imgp->firstpage = NULL;
+ imgp->ps_strings = 0;
+ imgp->auxarg_size = 0;
+ imgp->args = args;
+ imgp->execpath = imgp->freepath = NULL;
+ imgp->execpathp = 0;
+ imgp->canary = 0;
+ imgp->canarylen = 0;
+ imgp->pagesizes = 0;
+ imgp->pagesizeslen = 0;
+ imgp->stack_prot = 0;
+
+#ifdef MAC
+ error = mac_execve_enter(imgp, mac_p);
+ if (error)
+ goto exec_fail;
+#endif
+
+ imgp->image_header = NULL;
+
+ /*
+ * Translate the file name. namei() returns a vnode pointer
+	 * in ni_vp among other things.
+ *
+ * XXXAUDIT: It would be desirable to also audit the name of the
+ * interpreter if this is an interpreted binary.
+ */
+ if (args->fname != NULL) {
+ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
+ | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
+ }
+
+	SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0);
+
+interpret:
+ if (args->fname != NULL) {
+#ifdef CAPABILITY_MODE
+ /*
+ * While capability mode can't reach this point via direct
+ * path arguments to execve(), we also don't allow
+ * interpreters to be used in capability mode (for now).
+ * Catch indirect lookups and return a permissions error.
+ */
+ if (IN_CAPABILITY_MODE(td)) {
+ error = ECAPMODE;
+ goto exec_fail;
+ }
+#endif
+ error = namei(&nd);
+ if (error)
+ goto exec_fail;
+
+ binvp = nd.ni_vp;
+ imgp->vp = binvp;
+ } else {
+ AUDIT_ARG_FD(args->fd);
+ /*
+ * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
+ */
+ error = fgetvp_exec(td, args->fd,
+ cap_rights_init(&rights, CAP_FEXECVE), &binvp);
+ if (error)
+ goto exec_fail;
+ vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(binvp);
+ imgp->vp = binvp;
+ }
+
+ /*
+ * Check file permissions (also 'opens' file)
+ */
+ error = exec_check_permissions(imgp);
+ if (error)
+ goto exec_fail_dealloc;
+
+ imgp->object = imgp->vp->v_object;
+ if (imgp->object != NULL)
+ vm_object_reference(imgp->object);
+
+ /*
+ * Set VV_TEXT now so no one can write to the executable while we're
+ * activating it.
+ *
+ * Remember if this was set before and unset it in case this is not
+ * actually an executable image.
+ */
+ textset = VOP_IS_TEXT(imgp->vp);
+ VOP_SET_TEXT(imgp->vp);
+
+ error = exec_map_first_page(imgp);
+ if (error)
+ goto exec_fail_dealloc;
+
+ imgp->proc->p_osrel = 0;
+ /*
+ * If the current process has a special image activator it
+ * wants to try first, call it. For example, emulating shell
+ * scripts differently.
+ */
+ error = -1;
+ if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
+ error = img_first(imgp);
+
+ /*
+ * Loop through the list of image activators, calling each one.
+ * An activator returns -1 if there is no match, 0 on success,
+ * and an error otherwise.
+ */
+ for (i = 0; error == -1 && execsw[i]; ++i) {
+ if (execsw[i]->ex_imgact == NULL ||
+ execsw[i]->ex_imgact == img_first) {
+ continue;
+ }
+ error = (*execsw[i]->ex_imgact)(imgp);
+ }
+
+ if (error) {
+ if (error == -1) {
+ if (textset == 0)
+ VOP_UNSET_TEXT(imgp->vp);
+ error = ENOEXEC;
+ }
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Special interpreter operation, cleanup and loop up to try to
+ * activate the interpreter.
+ */
+ if (imgp->interpreted) {
+ exec_unmap_first_page(imgp);
+ /*
+ * VV_TEXT needs to be unset for scripts. There is a short
+ * period before we determine that something is a script where
+ * VV_TEXT will be set. The vnode lock is held over this
+ * entire period so nothing should illegitimately be blocked.
+ */
+ VOP_UNSET_TEXT(imgp->vp);
+ /* free name buffer and old vnode */
+ if (args->fname != NULL)
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+#ifdef MAC
+ mac_execve_interpreter_enter(binvp, &interpvplabel);
+#endif
+ if (imgp->opened) {
+ VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
+ imgp->opened = 0;
+ }
+ vput(binvp);
+ vm_object_deallocate(imgp->object);
+ imgp->object = NULL;
+ /* set new name to that of the interpreter */
+ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_SYSSPACE, imgp->interpreter_name, td);
+ args->fname = imgp->interpreter_name;
+ goto interpret;
+ }
+
+ /*
+ * NB: We unlock the vnode here because it is believed that none
+ * of the sv_copyout_strings/sv_fixup operations require the vnode.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+	 * Do our best to calculate the full path to the image file.
+ */
+ if (imgp->auxargs != NULL &&
+ ((args->fname != NULL && args->fname[0] == '/') ||
+ vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
+ imgp->execpath = args->fname;
+
+ /*
+ * Copy out strings (args and env) and initialize stack base
+ */
+ if (p->p_sysent->sv_copyout_strings)
+ stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
+ else
+ stack_base = exec_copyout_strings(imgp);
+
+ /*
+	 * If a custom stack fixup routine is present for this process,
+	 * let it do the stack setup.  Otherwise, push the argument count
+	 * as the first item on the stack.
+ */
+ if (p->p_sysent->sv_fixup != NULL)
+ (*p->p_sysent->sv_fixup)(&stack_base, imgp);
+ else
+ suword(--stack_base, imgp->args->argc);
+
+ /*
+ * For security and other reasons, the file descriptor table cannot
+ * be shared after an exec.
+ */
+ fdunshare(p, td);
+
+ /*
+ * Malloc things before we need locks.
+ */
+ newcred = crget();
+ euip = uifind(attr.va_uid);
+ i = imgp->args->begin_envv - imgp->args->begin_argv;
+ /* Cache arguments if they fit inside our allowance */
+ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
+ newargs = pargs_alloc(i);
+ bcopy(imgp->args->begin_argv, newargs->ar_args, i);
+ }
+
+ /* close files on exec */
+ fdcloseexec(td);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+
+ /* Get a reference to the vnode prior to locking the proc */
+ VREF(binvp);
+
+ /*
+ * For security and other reasons, signal handlers cannot
+ * be shared after an exec. The new process gets a copy of the old
+ * handlers. In execsigs(), the new process will have its signals
+ * reset.
+ */
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+ if (sigacts_shared(p->p_sigacts)) {
+ oldsigacts = p->p_sigacts;
+ PROC_UNLOCK(p);
+ newsigacts = sigacts_alloc();
+ sigacts_copy(newsigacts, oldsigacts);
+ PROC_LOCK(p);
+ p->p_sigacts = newsigacts;
+ } else
+ oldsigacts = NULL;
+
+ /* Stop profiling */
+ stopprofclock(p);
+
+ /* reset caught signals */
+ execsigs(p);
+
+ /* name this process - nameiexec(p, ndp) */
+ bzero(p->p_comm, sizeof(p->p_comm));
+ if (args->fname)
+ bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
+ min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
+ else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
+ bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
+ bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+
+ /*
+ * mark as execed, wakeup the process that vforked (if any) and tell
+ * it that it now has its own resources back
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
+ cv_broadcast(&p->p_pwait);
+ }
+
+ /*
+ * Implement image setuid/setgid.
+ *
+ * Don't honor setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ *
+	 * We disable setuid/setgid/etc in capability mode on the basis
+ * that most setugid applications are not written with that
+ * environment in mind, and will therefore almost certainly operate
+ * incorrectly. In principle there's no reason that setugid
+ * applications might not be useful in capability mode, so we may want
+ * to reconsider this conservative design choice in the future.
+ *
+ * XXXMAC: For the time being, use NOSUID to also prohibit
+ * transitions on the file system.
+ */
+ credential_changing = 0;
+ credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
+ attr.va_uid;
+ credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
+ attr.va_gid;
+#ifdef MAC
+ will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
+ interpvplabel, imgp);
+ credential_changing |= will_transition;
+#endif
+
+ if (credential_changing &&
+#ifdef CAPABILITY_MODE
+ ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
+#endif
+ (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+ (p->p_flag & P_TRACED) == 0) {
+ /*
+ * Turn off syscall tracing for set-id programs, except for
+ * root. Record any set-id flags first to make sure that
+ * we do not regain any tracing during a possible block.
+ */
+ setsugid(p);
+
+#ifdef KTRACE
+ if (p->p_tracecred != NULL &&
+ priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
+ ktrprocexec(p, &tracecred, &tracevp);
+#endif
+ /*
+ * Close any file descriptors 0..2 that reference procfs,
+ * then make sure file descriptors 0..2 are in use.
+ *
+ * setugidsafety() may call closef() and then pfind()
+ * which may grab the process lock.
+ * fdcheckstd() may call falloc() which may block to
+ * allocate memory, so temporarily drop the process lock.
+ */
+ PROC_UNLOCK(p);
+ VOP_UNLOCK(imgp->vp, 0);
+ setugidsafety(td);
+ error = fdcheckstd(td);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ if (error != 0)
+ goto done1;
+ PROC_LOCK(p);
+ /*
+ * Set the new credentials.
+ */
+ if (attr.va_mode & S_ISUID)
+ change_euid(newcred, euip);
+ if (attr.va_mode & S_ISGID)
+ change_egid(newcred, attr.va_gid);
+#ifdef MAC
+ if (will_transition) {
+ mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
+ interpvplabel, imgp);
+ }
+#endif
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXXMAC: Note that the current logic will save the
+ * uid and gid if a MAC domain transition occurs, even
+ * though maybe it shouldn't.
+ */
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ } else {
+ if (oldcred->cr_uid == oldcred->cr_ruid &&
+ oldcred->cr_gid == oldcred->cr_rgid)
+ p->p_flag &= ~P_SUGID;
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXX: It's not clear that the existing behavior is
+ * POSIX-compliant. A number of sources indicate that the
+ * saved uid/gid should only be updated if the new ruid is
+ * not equal to the old ruid, or the new euid is not equal
+ * to the old euid and the new euid is not equal to the old
+ * ruid. The FreeBSD code always updates the saved uid/gid.
+ * Also, this code uses the new (replaced) euid and egid as
+ * the source, which may or may not be the right ones to use.
+ */
+ if (oldcred->cr_svuid != oldcred->cr_uid ||
+ oldcred->cr_svgid != oldcred->cr_gid) {
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ }
+ }
+
+ /*
+ * Store the vp for use in procfs. This vnode was referenced prior
+ * to locking the proc lock.
+ */
+ textvp = p->p_textvp;
+ p->p_textvp = binvp;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the exec if it
+ * has declared an interest.
+ */
+ if (dtrace_fasttrap_exec)
+ dtrace_fasttrap_exec(p);
+#endif
+
+ /*
+ * Notify others that we exec'd, and clear the P_INEXEC flag
+ * as we're now a bona fide freshly-execed process.
+ */
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
+ p->p_flag &= ~P_INEXEC;
+
+ /* clear "fork but no exec" flag, as we _are_ execing */
+ p->p_acflag &= ~AFORK;
+
+ /*
+ * Free any previous argument cache and replace it with
+ * the new argument cache, if any.
+ */
+ oldargs = p->p_args;
+ p->p_args = newargs;
+ newargs = NULL;
+
+#ifdef HWPMC_HOOKS
+ /*
+ * Check if system-wide sampling is in effect or if the
+ * current process is using PMCs. If so, do exec() time
+ * processing. This processing needs to happen AFTER the
+ * P_INEXEC flag is cleared.
+ *
+ * The proc lock needs to be released before taking the PMC
+ * SX.
+ */
+ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
+ PROC_UNLOCK(p);
+ VOP_UNLOCK(imgp->vp, 0);
+ pe.pm_credentialschanged = credential_changing;
+ pe.pm_entryaddr = imgp->entry_addr;
+
+ PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ } else
+ PROC_UNLOCK(p);
+#else /* !HWPMC_HOOKS */
+ PROC_UNLOCK(p);
+#endif
+
+ /* Set values passed into the program in registers. */
+ if (p->p_sysent->sv_setregs)
+ (*p->p_sysent->sv_setregs)(td, imgp,
+ (u_long)(uintptr_t)stack_base);
+ else
+ exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
+
+ vfs_mark_atime(imgp->vp, td->td_ucred);
+
+ SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
+
+done1:
+ /*
+ * Free any resources malloc'd earlier that we didn't use.
+ */
+ uifree(euip);
+ if (newcred == NULL)
+ crfree(oldcred);
+ else
+ crfree(newcred);
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+ * Handle deferred decrement of ref counts.
+ */
+ if (textvp != NULL)
+ vrele(textvp);
+ if (binvp && error != 0)
+ vrele(binvp);
+#ifdef KTRACE
+ if (tracevp != NULL)
+ vrele(tracevp);
+ if (tracecred != NULL)
+ crfree(tracecred);
+#endif
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ pargs_drop(oldargs);
+ pargs_drop(newargs);
+ if (oldsigacts != NULL)
+ sigacts_free(oldsigacts);
+
+exec_fail_dealloc:
+
+ /*
+ * free various allocated resources
+ */
+ if (imgp->firstpage != NULL)
+ exec_unmap_first_page(imgp);
+
+ if (imgp->vp != NULL) {
+ if (args->fname)
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (imgp->opened)
+ VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
+ vput(imgp->vp);
+ }
+
+ if (imgp->object != NULL)
+ vm_object_deallocate(imgp->object);
+
+ free(imgp->freepath, M_TEMP);
+
+ if (error == 0) {
+ PROC_LOCK(p);
+ td->td_dbgflags |= TDB_EXEC;
+ PROC_UNLOCK(p);
+
+ /*
+ * Stop the process here if its stop event mask has
+ * the S_EXEC bit set.
+ */
+ STOPEVENT(p, S_EXEC, 0);
+ goto done2;
+ }
+
+exec_fail:
+ /* we're done here, clear P_INEXEC */
+ PROC_LOCK(p);
+ p->p_flag &= ~P_INEXEC;
+ PROC_UNLOCK(p);
+
+ SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
+
+done2:
+#ifdef MAC
+ mac_execve_exit(imgp);
+ mac_execve_interpreter_exit(interpvplabel);
+#endif
+ exec_free_args(args);
+
+ if (error && imgp->vmspace_destroyed) {
+		/* Sorry, there is no process to return to; exit gracefully. */
+ exit1(td, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ }
+
+#ifdef KTRACE
+ if (error == 0)
+ ktrprocctor(p);
+#endif
+
+ return (error);
+}
+
+int
+exec_map_first_page(imgp)
+ struct image_params *imgp;
+{
+ int rv, i;
+ int initial_pagein;
+ vm_page_t ma[VM_INITIAL_PAGEIN];
+ vm_object_t object;
+
+ if (imgp->firstpage != NULL)
+ exec_unmap_first_page(imgp);
+
+ object = imgp->vp->v_object;
+ if (object == NULL)
+ return (EACCES);
+ VM_OBJECT_WLOCK(object);
+#if VM_NRESERVLEVEL > 0
+ if ((object->flags & OBJ_COLORED) == 0) {
+ object->flags |= OBJ_COLORED;
+ object->pg_color = 0;
+ }
+#endif
+ ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
+ if (ma[0]->valid != VM_PAGE_BITS_ALL) {
+ initial_pagein = VM_INITIAL_PAGEIN;
+ if (initial_pagein > object->size)
+ initial_pagein = object->size;
+ for (i = 1; i < initial_pagein; i++) {
+ if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
+ if (ma[i]->valid)
+ break;
+ if (vm_page_tryxbusy(ma[i]))
+ break;
+ } else {
+ ma[i] = vm_page_alloc(object, i,
+ VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
+ if (ma[i] == NULL)
+ break;
+ }
+ }
+ initial_pagein = i;
+ rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
+ ma[0] = vm_page_lookup(object, 0);
+ if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
+ if (ma[0] != NULL) {
+ vm_page_lock(ma[0]);
+ vm_page_free(ma[0]);
+ vm_page_unlock(ma[0]);
+ }
+ VM_OBJECT_WUNLOCK(object);
+ return (EIO);
+ }
+ }
+ vm_page_xunbusy(ma[0]);
+ vm_page_lock(ma[0]);
+ vm_page_hold(ma[0]);
+ vm_page_unlock(ma[0]);
+ VM_OBJECT_WUNLOCK(object);
+
+ imgp->firstpage = sf_buf_alloc(ma[0], 0);
+ imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
+
+ return (0);
+}
+
+void
+exec_unmap_first_page(imgp)
+ struct image_params *imgp;
+{
+ vm_page_t m;
+
+ if (imgp->firstpage != NULL) {
+ m = sf_buf_page(imgp->firstpage);
+ sf_buf_free(imgp->firstpage);
+ imgp->firstpage = NULL;
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
+ }
+}
+
+/*
+ * Destroy old address space, and allocate a new stack
+ * The new stack is only SGROWSIZ large because it is grown
+ * automatically in trap.c.
+ */
+int
+exec_new_vmspace(imgp, sv)
+ struct image_params *imgp;
+ struct sysentvec *sv;
+{
+ int error;
+ struct proc *p = imgp->proc;
+ struct vmspace *vmspace = p->p_vmspace;
+ vm_object_t obj;
+ vm_offset_t sv_minuser, stack_addr;
+ vm_map_t map;
+ u_long ssiz;
+
+ imgp->vmspace_destroyed = 1;
+ imgp->sysent = sv;
+
+ /* May be called with Giant held */
+ EVENTHANDLER_INVOKE(process_exec, p, imgp);
+
+ /*
+	 * Blow away the entire process VM if the address space is not
+	 * shared; otherwise, create a new VM space so that other threads
+	 * are not disrupted.
+ */
+ map = &vmspace->vm_map;
+ if (map_at_zero)
+ sv_minuser = sv->sv_minuser;
+ else
+ sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
+ if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
+ vm_map_max(map) == sv->sv_maxuser) {
+ shmexit(vmspace);
+ pmap_remove_pages(vmspace_pmap(vmspace));
+ vm_map_remove(map, vm_map_min(map), vm_map_max(map));
+ } else {
+ error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
+ if (error)
+ return (error);
+ vmspace = p->p_vmspace;
+ map = &vmspace->vm_map;
+ }
+
+ /* Map a shared page */
+ obj = sv->sv_shared_page_obj;
+ if (obj != NULL) {
+ vm_object_reference(obj);
+ error = vm_map_fixed(map, obj, 0,
+ sv->sv_shared_page_base, sv->sv_shared_page_len,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
+ if (error) {
+ vm_object_deallocate(obj);
+ return (error);
+ }
+ }
+
+ /* Allocate a new stack */
+ if (sv->sv_maxssiz != NULL)
+ ssiz = *sv->sv_maxssiz;
+ else
+ ssiz = maxssiz;
+ stack_addr = sv->sv_usrstack - ssiz;
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+ obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
+ sv->sv_stackprot,
+ VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
+ if (error)
+ return (error);
+
+#ifdef __ia64__
+ /* Allocate a new register stack */
+ stack_addr = IA64_BACKINGSTORE;
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+ sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
+ if (error)
+ return (error);
+#endif
+
+	/*
+	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
+ * VM_STACK case, but they are still used to monitor the size of the
+ * process stack so we can check the stack rlimit.
+ */
+ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
+ vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
+
+ return (0);
+}
+
+/*
+ * Copy in the argument and environment strings from the old process address
+ * space into the temporary string buffer.
+ */
+int
+exec_copyin_args(struct image_args *args, char *fname,
+ enum uio_seg segflg, char **argv, char **envv)
+{
+ char *argp, *envp;
+ int error;
+ size_t length;
+
+ bzero(args, sizeof(*args));
+ if (argv == NULL)
+ return (EFAULT);
+
+ /*
+ * Allocate demand-paged memory for the file name, argument, and
+ * environment strings.
+ */
+ error = exec_alloc_args(args);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Copy the file name.
+ */
+ if (fname != NULL) {
+ args->fname = args->buf;
+ error = (segflg == UIO_SYSSPACE) ?
+ copystr(fname, args->fname, PATH_MAX, &length) :
+ copyinstr(fname, args->fname, PATH_MAX, &length);
+ if (error != 0)
+ goto err_exit;
+ } else
+ length = 0;
+
+ args->begin_argv = args->buf + length;
+ args->endp = args->begin_argv;
+ args->stringspace = ARG_MAX;
+
+ /*
+ * extract arguments first
+ */
+ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
+ if (argp == (caddr_t) -1) {
+ error = EFAULT;
+ goto err_exit;
+ }
+ if ((error = copyinstr(argp, args->endp,
+ args->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ error = E2BIG;
+ goto err_exit;
+ }
+ args->stringspace -= length;
+ args->endp += length;
+ args->argc++;
+ }
+
+ args->begin_envv = args->endp;
+
+ /*
+ * extract environment strings
+ */
+ if (envv) {
+ while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
+ if (envp == (caddr_t)-1) {
+ error = EFAULT;
+ goto err_exit;
+ }
+ if ((error = copyinstr(envp, args->endp,
+ args->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ error = E2BIG;
+ goto err_exit;
+ }
+ args->stringspace -= length;
+ args->endp += length;
+ args->envc++;
+ }
+ }
+
+ return (0);
+
+err_exit:
+ exec_free_args(args);
+ return (error);
+}
+
+/*
+ * Allocate temporary demand-paged, zero-filled memory for the file name,
+ * argument, and environment strings. Returns zero if the allocation succeeds
+ * and ENOMEM otherwise.
+ */
+int
+exec_alloc_args(struct image_args *args)
+{
+
+ args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
+ return (args->buf != NULL ? 0 : ENOMEM);
+}
+
+void
+exec_free_args(struct image_args *args)
+{
+
+ if (args->buf != NULL) {
+ kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
+ PATH_MAX + ARG_MAX);
+ args->buf = NULL;
+ }
+ if (args->fname_buf != NULL) {
+ free(args->fname_buf, M_TEMP);
+ args->fname_buf = NULL;
+ }
+}
+
+/*
+ * Copy strings out to the new process address space, constructing new arg
+ * and env vector tables. Return a pointer to the base so that it can be used
+ * as the initial stack pointer.
+ */
+register_t *
+exec_copyout_strings(imgp)
+ struct image_params *imgp;
+{
+ int argc, envc;
+ char **vectp;
+ char *stringp, *destp;
+ register_t *stack_base;
+ struct ps_strings *arginfo;
+ struct proc *p;
+ size_t execpath_len;
+ int szsigcode, szps;
+ char canary[sizeof(long) * 8];
+
+ szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ if (imgp->execpath != NULL && imgp->auxargs != NULL)
+ execpath_len = strlen(imgp->execpath) + 1;
+ else
+ execpath_len = 0;
+ p = imgp->proc;
+ szsigcode = 0;
+ arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
+ if (p->p_sysent->sv_sigcode_base == 0) {
+ if (p->p_sysent->sv_szsigcode != NULL)
+ szsigcode = *(p->p_sysent->sv_szsigcode);
+ }
+ destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
+ roundup(execpath_len, sizeof(char *)) -
+ roundup(sizeof(canary), sizeof(char *)) -
+ roundup(szps, sizeof(char *)) -
+ roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
+
+ /*
+ * install sigcode
+ */
+ if (szsigcode != 0)
+ copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
+ szsigcode), szsigcode);
+
+ /*
+ * Copy the image path for the rtld.
+ */
+ if (execpath_len != 0) {
+ imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
+ copyout(imgp->execpath, (void *)imgp->execpathp,
+ execpath_len);
+ }
+
+ /*
+ * Prepare the canary for SSP.
+ */
+ arc4rand(canary, sizeof(canary), 0);
+ imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
+ sizeof(canary);
+ copyout(canary, (void *)imgp->canary, sizeof(canary));
+ imgp->canarylen = sizeof(canary);
+
+ /*
+ * Prepare the pagesizes array.
+ */
+ imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
+ roundup(sizeof(canary), sizeof(char *)) - szps;
+ copyout(pagesizes, (void *)imgp->pagesizes, szps);
+ imgp->pagesizeslen = szps;
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs) {
+ /*
+		 * 'AT_COUNT*2' is the size of the ELF auxargs data.  This
+		 * default is kept for backward compatibility.
+ */
+ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
+ (AT_COUNT * 2);
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+		 * the arg and env vector sets, and imgp->auxarg_size is room
+		 * for the arguments of the runtime loader.
+ */
+ vectp = (char **)(destp - (imgp->args->argc +
+ imgp->args->envc + 2 + imgp->auxarg_size)
+ * sizeof(char *));
+ } else {
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+ * the arg and env vector sets
+ */
+ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
+ sizeof(char *));
+ }
+
+ /*
+ * vectp also becomes our initial stack base
+ */
+ stack_base = (register_t *)vectp;
+
+ stringp = imgp->args->begin_argv;
+ argc = imgp->args->argc;
+ envc = imgp->args->envc;
+
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
+ suword32(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword(vectp++, 0);
+
+ suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
+ suword32(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword(vectp, 0);
+
+ return (stack_base);
+}
+
+/*
+ * Check permissions of file to execute.
+ * Called with imgp->vp locked.
+ * Return 0 for success or error code on failure.
+ */
+int
+exec_check_permissions(imgp)
+ struct image_params *imgp;
+{
+ struct vnode *vp = imgp->vp;
+ struct vattr *attr = imgp->attr;
+ struct thread *td;
+ int error, writecount;
+
+ td = curthread;
+
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, attr, td->td_ucred);
+ if (error)
+ return (error);
+
+#ifdef MAC
+ error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
+ if (error)
+ return (error);
+#endif
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that
+ * this file resides on.
+ * 2) Ensure that at least one execute bit is on. Otherwise, a
+ * privileged user will always succeed, and we don't want this
+ * to happen unless the file really is executable.
+ * 3) Ensure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
+ (attr->va_type != VREG))
+ return (EACCES);
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr->va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ */
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ if (error)
+ return (error);
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ error = VOP_GET_WRITECOUNT(vp, &writecount);
+ if (error != 0)
+ return (error);
+ if (writecount != 0)
+ return (ETXTBSY);
+
+ /*
+ * Call filesystem specific open routine (which does nothing in the
+ * general case).
+ */
+ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
+ if (error == 0)
+ imgp->opened = 1;
+ return (error);
+}
+
+/*
+ * Exec handler registration
+ */
+int
+exec_register(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 2; /* New slot and trailing NULL */
+
+ if (execsw)
+ for (es = execsw; *es; es++)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return (ENOMEM);
+ xs = newexecsw;
+ if (execsw)
+ for (es = execsw; *es; es++)
+ *xs++ = *es;
+ *xs++ = execsw_arg;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return (0);
+}
+
+int
+exec_unregister(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 1;
+
+ if (execsw == NULL)
+ panic("unregister with no handlers left?\n");
+
+ for (es = execsw; *es; es++) {
+ if (*es == execsw_arg)
+ break;
+ }
+ if (*es == NULL)
+ return (ENOENT);
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return (ENOMEM);
+ xs = newexecsw;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ *xs++ = *es;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return (0);
+}
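+
+/*
+ * A minimal, hypothetical sketch of an image activator plugging into the
+ * execsw[] table managed above.  The EXEC_SET() macro from <sys/imgact.h>
+ * arranges for exec_register()/exec_unregister() to run at module load and
+ * unload.  All names here are placeholders; guarded out so it is never
+ * compiled.
+ */
+#if 0
+static int
+exec_example_imgact(struct image_params *imgp)
+{
+
+	/* Returning -1 tells the activator loop to try the next handler. */
+	return (-1);
+}
+
+static struct execsw example_execsw = {
+	.ex_imgact = exec_example_imgact,
+	.ex_name = "example"
+};
+EXEC_SET(example, example_execsw);
+#endif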
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
new file mode 100644
index 0000000..f0be10e
--- /dev/null
+++ b/sys/kern/kern_exit.c
@@ -0,0 +1,1261 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/pioctl.h>
+#include <sys/jail.h>
+#include <sys/tty.h>
+#include <sys/wait.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/signalvar.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/syslog.h>
+#include <sys/ptrace.h>
+#include <sys/acct.h> /* for acct_process() function prototype */
+#include <sys/filedesc.h>
+#include <sys/sdt.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_execexit_func_t dtrace_fasttrap_exit;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE1(proc, kernel, , exit, exit, "int");
+
+/* Hook for NFS teardown procedure. */
+void (*nlminfo_release_p)(struct proc *p);
+
+static void
+clear_orphan(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (p->p_flag & P_ORPHAN) {
+ LIST_REMOVE(p, p_orphan);
+ p->p_flag &= ~P_ORPHAN;
+ }
+}
+
+/*
+ * exit -- death of process.
+ */
+void
+sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
+{
+
+ exit1(td, W_EXITCODE(uap->rval, 0));
+ /* NOTREACHED */
+}
+
+/*
+ * Exit: deallocate address space and other resources, change proc state to
+ * zombie, and unlink proc from allproc and parent's lists. Save exit status
+ * and rusage for wait(). Check for child processes and orphan them.
+ */
+void
+exit1(struct thread *td, int rv)
+{
+ struct proc *p, *nq, *q;
+ struct vnode *vtmp;
+ struct vnode *ttyvp = NULL;
+ struct plimit *plim;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ p = td->td_proc;
+ /*
+ * XXX in case we're rebooting we just let init die in order to
+ * work around an unsolved stack overflow seen very late during
+ * shutdown on sparc64 when the gmirror worker process exists.
+ */
+ if (p == initproc && rebooting == 0) {
+ printf("init died (signal %d, exit %d)\n",
+ WTERMSIG(rv), WEXITSTATUS(rv));
+ panic("Going nowhere without my init!");
+ }
+
+ /*
+ * MUST abort all other threads before proceeding past here.
+ */
+ PROC_LOCK(p);
+ while (p->p_flag & P_HADTHREADS) {
+ /*
+ * First check if some other thread got here before us.
+ * If so, act appropriately: exit or suspend.
+ */
+ thread_suspend_check(0);
+
+ /*
+ * Kill off the other threads. This requires
+ * some co-operation from other parts of the kernel
+ * so it may not be instantaneous. With this state set
+ * any thread entering the kernel from userspace will
+ * thread_exit() in trap(). Any thread attempting to
+ * sleep will return immediately with EINTR or EWOULDBLOCK
+ * which will hopefully force them to back out to userland
+ * freeing resources as they go. Any thread attempting
+ * to return to userland will thread_exit() from userret().
+ * thread_exit() will unsuspend us when the last of the
+ * other threads exits.
+ * If there is already a thread singler after resumption,
+ * calling thread_single will fail; in that case, we just
+		 * re-check all suspension requests; the thread should
+ * either be suspended there or exit.
+ */
+ if (!thread_single(SINGLE_EXIT))
+ break;
+
+ /*
+ * All other activity in this process is now stopped.
+ * Threading support has been turned off.
+ */
+ }
+ KASSERT(p->p_numthreads == 1,
+ ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
+ racct_sub(p, RACCT_NTHR, 1);
+ /*
+ * Wakeup anyone in procfs' PIOCWAIT. They should have a hold
+ * on our vmspace, so we should block below until they have
+ * released their reference to us. Note that if they have
+ * requested S_EXIT stops we will block here until they ack
+ * via PIOCCONT.
+ */
+ _STOPEVENT(p, S_EXIT, rv);
+
+ /*
+ * Ignore any pending request to stop due to a stop signal.
+ * Once P_WEXIT is set, future requests will be ignored as
+ * well.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped"));
+
+ /*
+ * Note that we are exiting and do another wakeup of anyone in
+ * PIOCWAIT in case they aren't listening for S_EXIT stops or
+ * decided to wait again after we told them we are exiting.
+ */
+ p->p_flag |= P_WEXIT;
+ wakeup(&p->p_stype);
+
+ /*
+ * Wait for any processes that have a hold on our vmspace to
+ * release their reference.
+ */
+ while (p->p_lock > 0)
+ msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
+
+ p->p_xstat = rv; /* Let event handler change exit status */
+ PROC_UNLOCK(p);
+ /* Drain the limit callout while we don't have the proc locked */
+ callout_drain(&p->p_limco);
+
+#ifdef AUDIT
+ /*
+ * The Sun BSM exit token contains two components: an exit status as
+ * passed to exit(), and a return value to indicate what sort of exit
+ * it was. The exit status is WEXITSTATUS(rv), but it's not clear
+ * what the return value is.
+ */
+ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0);
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
+
+ /* Are we a task leader? */
+ if (p == p->p_leader) {
+ mtx_lock(&ppeers_lock);
+ q = p->p_peers;
+ while (q != NULL) {
+ PROC_LOCK(q);
+ kern_psignal(q, SIGKILL);
+ PROC_UNLOCK(q);
+ q = q->p_peers;
+ }
+ while (p->p_peers != NULL)
+ msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
+ mtx_unlock(&ppeers_lock);
+ }
+
+ /*
+ * Check if any loadable modules need anything done at process exit.
+ * E.g. SYSV IPC stuff
+ * XXX what if one of these generates an error?
+ */
+ EVENTHANDLER_INVOKE(process_exit, p);
+
+ /*
+ * If parent is waiting for us to exit or exec,
+ * P_PPWAIT is set; we will wakeup the parent below.
+ */
+ PROC_LOCK(p);
+ rv = p->p_xstat; /* Event handler could change exit status */
+ stopprofclock(p);
+ p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
+
+ /*
+ * Stop the real interval timer. If the handler is currently
+ * executing, prevent it from rearming itself and let it finish.
+ */
+ if (timevalisset(&p->p_realtimer.it_value) &&
+ callout_stop(&p->p_itcallout) == 0) {
+ timevalclear(&p->p_realtimer.it_interval);
+ msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
+ KASSERT(!timevalisset(&p->p_realtimer.it_value),
+ ("realtime timer is still armed"));
+ }
+ PROC_UNLOCK(p);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pid.
+ */
+ funsetownlst(&p->p_sigiolst);
+
+ /*
+ * If this process has an nlminfo data area (for lockd), release it
+ */
+ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
+ (*nlminfo_release_p)(p);
+
+ /*
+ * Close open files and release open-file table.
+ * This may block!
+ */
+ fdescfree(td);
+
+ /*
+ * If this thread tickled GEOM, we need to wait for the giggling to
+ * stop before we return to userland
+ */
+ if (td->td_pflags & TDP_GEOM)
+ g_waitidle();
+
+ /*
+ * Remove ourself from our leader's peer list and wake our leader.
+ */
+ mtx_lock(&ppeers_lock);
+ if (p->p_leader->p_peers) {
+ q = p->p_leader;
+ while (q->p_peers != p)
+ q = q->p_peers;
+ q->p_peers = p->p_peers;
+ wakeup(p->p_leader);
+ }
+ mtx_unlock(&ppeers_lock);
+
+ vmspace_exit(td);
+
+ sx_xlock(&proctree_lock);
+ if (SESS_LEADER(p)) {
+ struct session *sp = p->p_session;
+ struct tty *tp;
+
+ /*
+ * s_ttyp is not zero'd; we use this to indicate that
+ * the session once had a controlling terminal. (for
+ * logging and informational purposes)
+ */
+ SESS_LOCK(sp);
+ ttyvp = sp->s_ttyvp;
+ tp = sp->s_ttyp;
+ sp->s_ttyvp = NULL;
+ sp->s_ttydp = NULL;
+ sp->s_leader = NULL;
+ SESS_UNLOCK(sp);
+
+ /*
+ * Signal foreground pgrp and revoke access to
+ * controlling terminal if it has not been revoked
+ * already.
+ *
+ * Because the TTY may have been revoked in the mean
+ * time and could already have a new session associated
+ * with it, make sure we don't send a SIGHUP to a
+ * foreground process group that does not belong to this
+ * session.
+ */
+
+ if (tp != NULL) {
+ tty_lock(tp);
+ if (tp->t_session == sp)
+ tty_signal_pgrp(tp, SIGHUP);
+ tty_unlock(tp);
+ }
+
+ if (ttyvp != NULL) {
+ sx_xunlock(&proctree_lock);
+ if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
+ VOP_REVOKE(ttyvp, REVOKEALL);
+ VOP_UNLOCK(ttyvp, 0);
+ }
+ sx_xlock(&proctree_lock);
+ }
+ }
+ fixjobc(p, p->p_pgrp, 0);
+ sx_xunlock(&proctree_lock);
+ (void)acct_process(td);
+
+ /* Release the TTY now we've unlocked everything. */
+ if (ttyvp != NULL)
+ vrele(ttyvp);
+#ifdef KTRACE
+ ktrprocexit(td);
+#endif
+ /*
+ * Release reference to text vnode
+ */
+ if ((vtmp = p->p_textvp) != NULL) {
+ p->p_textvp = NULL;
+ vrele(vtmp);
+ }
+
+ /*
+ * Release our limits structure.
+ */
+ PROC_LOCK(p);
+ plim = p->p_limit;
+ p->p_limit = NULL;
+ PROC_UNLOCK(p);
+ lim_free(plim);
+
+ tidhash_remove(td);
+
+ /*
+ * Remove proc from allproc queue and pidhash chain.
+ * Place onto zombproc. Unlink from parent's child list.
+ */
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list);
+ LIST_INSERT_HEAD(&zombproc, p, p_list);
+ LIST_REMOVE(p, p_hash);
+ sx_xunlock(&allproc_lock);
+
+ /*
+ * Call machine-dependent code to release any
+ * machine-dependent resources other than the address space.
+ * The address space is released by "vmspace_exitfree(p)" in
+ * vm_waitproc().
+ */
+ cpu_exit(td);
+
+ WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
+
+ /*
+ * Reparent all of our children to init.
+ */
+ sx_xlock(&proctree_lock);
+ q = LIST_FIRST(&p->p_children);
+ if (q != NULL) /* only need this if any child is S_ZOMB */
+ wakeup(initproc);
+ for (; q != NULL; q = nq) {
+ nq = LIST_NEXT(q, p_sibling);
+ PROC_LOCK(q);
+ proc_reparent(q, initproc);
+ q->p_sigparent = SIGCHLD;
+ /*
+ * Traced processes are killed
+ * since their existence means someone is screwing up.
+ */
+ if (q->p_flag & P_TRACED) {
+ struct thread *temp;
+
+ /*
+ * Since q was found on our children list, the
+ * proc_reparent() call moved q to the orphan
+ * list due to present P_TRACED flag. Clear
+ * orphan link for q now while q is locked.
+ */
+ clear_orphan(q);
+ q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
+ FOREACH_THREAD_IN_PROC(q, temp)
+ temp->td_dbgflags &= ~TDB_SUSPEND;
+ kern_psignal(q, SIGKILL);
+ }
+ PROC_UNLOCK(q);
+ }
+
+ /*
+ * Also get rid of our orphans.
+ */
+ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
+ PROC_LOCK(q);
+ clear_orphan(q);
+ PROC_UNLOCK(q);
+ }
+
+ /* Save exit status. */
+ PROC_LOCK(p);
+ p->p_xthread = td;
+
+ /* Tell the prison that we are gone. */
+ prison_proc_free(p->p_ucred->cr_prison);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the exit if it
+ * has declared an interest.
+ */
+ if (dtrace_fasttrap_exit)
+ dtrace_fasttrap_exit(p);
+#endif
+
+ /*
+ * Notify interested parties of our demise.
+ */
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
+
+#ifdef KDTRACE_HOOKS
+ int reason = CLD_EXITED;
+ if (WCOREDUMP(rv))
+ reason = CLD_DUMPED;
+ else if (WIFSIGNALED(rv))
+ reason = CLD_KILLED;
+ SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
+#endif
+
+ /*
+ * Just delete all entries in the p_klist. At this point we won't
+ * report any more events, and there are nasty race conditions that
+ * can beat us if we don't.
+ */
+ knlist_clear(&p->p_klist, 1);
+
+ /*
+ * If this is a process with a descriptor, we may not need to deliver
+ * a signal to the parent. proctree_lock is held over
+ * procdesc_exit() to serialize concurrent calls to close() and
+ * exit().
+ */
+#ifdef PROCDESC
+ if (p->p_procdesc == NULL || procdesc_exit(p)) {
+#endif
+ /*
+ * Notify parent that we're gone. If parent has the
+ * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
+ * notify process 1 instead (and hope it will handle this
+ * situation).
+ */
+ PROC_LOCK(p->p_pptr);
+ mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
+ if (p->p_pptr->p_sigacts->ps_flag &
+ (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
+ struct proc *pp;
+
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+ pp = p->p_pptr;
+ PROC_UNLOCK(pp);
+ proc_reparent(p, initproc);
+ p->p_sigparent = SIGCHLD;
+ PROC_LOCK(p->p_pptr);
+
+ /*
+		 * Notify the parent so that, if it was blocked in wait(2)
+		 * or waitpid(2) on our pid, it will continue.
+ */
+ wakeup(pp);
+ } else
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+
+ if (p->p_pptr == initproc)
+ kern_psignal(p->p_pptr, SIGCHLD);
+ else if (p->p_sigparent != 0) {
+ if (p->p_sigparent == SIGCHLD)
+ childproc_exited(p);
+ else /* LINUX thread */
+ kern_psignal(p->p_pptr, p->p_sigparent);
+ }
+#ifdef PROCDESC
+ } else
+ PROC_LOCK(p->p_pptr);
+#endif
+ sx_xunlock(&proctree_lock);
+
+ /*
+	 * The state PRS_ZOMBIE prevents other processes from sending
+	 * signals to the process.  To avoid a memory leak, we free the
+	 * memory for the signal queue at the time the state is set.
+ */
+ sigqueue_flush(&p->p_sigqueue);
+ sigqueue_flush(&td->td_sigqueue);
+
+ /*
+ * We have to wait until after acquiring all locks before
+ * changing p_state. We need to avoid all possible context
+ * switches (including ones from blocking on a mutex) while
+ * marked as a zombie. We also have to set the zombie state
+ * before we release the parent process' proc lock to avoid
+ * a lost wakeup. So, we first call wakeup, then we grab the
+ * sched lock, update the state, and release the parent process'
+ * proc lock.
+ */
+ wakeup(p->p_pptr);
+ cv_broadcast(&p->p_pwait);
+ sched_exit(p->p_pptr, td);
+ PROC_SLOCK(p);
+ p->p_state = PRS_ZOMBIE;
+ PROC_UNLOCK(p->p_pptr);
+
+ /*
+ * Hopefully no one will try to deliver a signal to the process this
+ * late in the game.
+ */
+ knlist_destroy(&p->p_klist);
+
+ /*
+ * Save our children's rusage information in our exit rusage.
+ */
+ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+
+ /*
+ * Make sure the scheduler takes this thread out of its tables etc.
+ * This will also release this thread's reference to the ucred.
+ * Other thread parts to release include pcb bits and such.
+ */
+ thread_exit();
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct abort2_args {
+ char *why;
+ int nargs;
+ void **args;
+};
+#endif
+
+int
+sys_abort2(struct thread *td, struct abort2_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct sbuf *sb;
+ void *uargs[16];
+ int error, i, sig;
+
+ /*
+	 * Do it right now so we can log either a proper call of abort2() or
+	 * note that an invalid argument was passed.  512 is big enough to
+ * handle 16 arguments' descriptions with additional comments.
+ */
+ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
+ sbuf_clear(sb);
+ sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
+ p->p_comm, p->p_pid, td->td_ucred->cr_uid);
+ /*
+	 * Since we can't return from abort2(), send SIGKILL in cases where
+	 * abort2() was called improperly.
+ */
+ sig = SIGKILL;
+	/* Prevent DoS attacks from user space. */
+ if (uap->nargs < 0 || uap->nargs > 16)
+ goto out;
+ if (uap->nargs > 0) {
+ if (uap->args == NULL)
+ goto out;
+ error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
+ if (error != 0)
+ goto out;
+ }
+ /*
+	 * Limit the size of the 'reason' string to 128.  It will fit even
+	 * when the maximal number of arguments is logged.
+ */
+ if (uap->why != NULL) {
+ error = sbuf_copyin(sb, uap->why, 128);
+ if (error < 0)
+ goto out;
+ } else {
+ sbuf_printf(sb, "(null)");
+ }
+ if (uap->nargs > 0) {
+ sbuf_printf(sb, "(");
+		for (i = 0; i < uap->nargs; i++)
+ sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
+ sbuf_printf(sb, ")");
+ }
+ /*
+	 * Final stage: the arguments were proper, the string has been
+	 * copied from userspace successfully, and copying the pointers
+	 * from user space succeeded.
+ */
+ sig = SIGABRT;
+out:
+ if (sig == SIGKILL) {
+ sbuf_trim(sb);
+ sbuf_printf(sb, " (Reason text inaccessible)");
+ }
+ sbuf_cat(sb, "\n");
+ sbuf_finish(sb);
+ log(LOG_INFO, "%s", sbuf_data(sb));
+ sbuf_delete(sb);
+ exit1(td, W_EXITCODE(0, sig));
+ return (0);
+}
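+
+/*
+ * For reference, a hypothetical userland sketch of abort2(2) as handled
+ * above: a short reason string plus up to 16 pointer arguments are logged
+ * before the process is killed.  The values are placeholders.  Guarded
+ * out; it is not part of the kernel build.
+ */
+#if 0
+#include <stdlib.h>
+
+static void
+example_abort2(void *ctx)
+{
+	void *args[1];
+
+	args[0] = ctx;
+	abort2("invariant violated", 1, args);
+}
+#endif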
+
+
+#ifdef COMPAT_43
+/*
+ * The dirty work is handled by kern_wait().
+ */
+int
+owait(struct thread *td, struct owait_args *uap __unused)
+{
+ int error, status;
+
+ error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
+ if (error == 0)
+ td->td_retval[1] = status;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * The dirty work is handled by kern_wait().
+ */
+int
+sys_wait4(struct thread *td, struct wait4_args *uap)
+{
+ struct rusage ru, *rup;
+ int error, status;
+
+ if (uap->rusage != NULL)
+ rup = &ru;
+ else
+ rup = NULL;
+ error = kern_wait(td, uap->pid, &status, uap->options, rup);
+ if (uap->status != NULL && error == 0)
+ error = copyout(&status, uap->status, sizeof(status));
+ if (uap->rusage != NULL && error == 0)
+ error = copyout(&ru, uap->rusage, sizeof(struct rusage));
+ return (error);
+}
+
+int
+sys_wait6(struct thread *td, struct wait6_args *uap)
+{
+ struct __wrusage wru, *wrup;
+ siginfo_t si, *sip;
+ idtype_t idtype;
+ id_t id;
+ int error, status;
+
+ idtype = uap->idtype;
+ id = uap->id;
+
+ if (uap->wrusage != NULL)
+ wrup = &wru;
+ else
+ wrup = NULL;
+
+ if (uap->info != NULL) {
+ sip = &si;
+ bzero(sip, sizeof(*sip));
+ } else
+ sip = NULL;
+
+ /*
+ * We expect all callers of wait6() to know about WEXITED and
+ * WTRAPPED.
+ */
+ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
+
+ if (uap->status != NULL && error == 0)
+ error = copyout(&status, uap->status, sizeof(status));
+ if (uap->wrusage != NULL && error == 0)
+ error = copyout(&wru, uap->wrusage, sizeof(wru));
+ if (uap->info != NULL && error == 0)
+ error = copyout(&si, uap->info, sizeof(si));
+ return (error);
+}
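+
+/*
+ * For reference, a hypothetical userland sketch of wait6(2) as handled
+ * above.  Unlike wait4(), the caller must request WEXITED (or WTRAPPED)
+ * explicitly.  The child pid is a placeholder.  Guarded out; it is not
+ * part of the kernel build.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+static pid_t
+example_wait6(pid_t child)
+{
+	siginfo_t si;
+	int status;
+
+	return (wait6(P_PID, (id_t)child, &status, WEXITED, NULL, &si));
+}
+#endif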
+
+/*
+ * Reap the remains of a zombie process and optionally return status and
+ * rusage.  Asserts that both the proctree_lock and the process lock are held
+ * on entry, and releases them as part of its work.
+ */
+void
+proc_reap(struct thread *td, struct proc *p, int *status, int options)
+{
+ struct proc *q, *t;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
+
+ q = td->td_proc;
+
+ PROC_SUNLOCK(p);
+ td->td_retval[0] = p->p_pid;
+ if (status)
+ *status = p->p_xstat; /* convert to int */
+ if (options & WNOWAIT) {
+ /*
+ * Only poll, returning the status. Caller does not wish to
+ * release the proc struct just yet.
+ */
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ return;
+ }
+
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ PROC_UNLOCK(p);
+
+ /*
+ * If we got the child via a ptrace 'attach', we need to give it back
+ * to the old parent.
+ */
+ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
+ PROC_LOCK(p);
+ proc_reparent(p, t);
+ p->p_oppid = 0;
+ PROC_UNLOCK(p);
+ pksignal(t, SIGCHLD, p->p_ksi);
+ wakeup(t);
+ cv_broadcast(&p->p_pwait);
+ PROC_UNLOCK(t);
+ sx_xunlock(&proctree_lock);
+ return;
+ }
+
+ /*
+ * Remove other references to this process to ensure we have an
+ * exclusive reference.
+ */
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list); /* off zombproc */
+ sx_xunlock(&allproc_lock);
+ LIST_REMOVE(p, p_sibling);
+ PROC_LOCK(p);
+ clear_orphan(p);
+ PROC_UNLOCK(p);
+ leavepgrp(p);
+#ifdef PROCDESC
+ if (p->p_procdesc != NULL)
+ procdesc_reap(p);
+#endif
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * As a side effect of this lock, we know that all other writes to
+ * this proc are visible now, so no more locking is needed for p.
+ */
+ PROC_LOCK(p);
+ p->p_xstat = 0; /* XXX: why? */
+ PROC_UNLOCK(p);
+ PROC_LOCK(q);
+ ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
+ PROC_UNLOCK(q);
+
+ /*
+ * Decrement the count of procs running with this uid.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
+
+ /*
+ * Destroy resource accounting information associated with the process.
+ */
+#ifdef RACCT
+ PROC_LOCK(p);
+ racct_sub(p, RACCT_NPROC, 1);
+ PROC_UNLOCK(p);
+#endif
+ racct_proc_exit(p);
+
+ /*
+ * Free credentials, arguments, and sigacts.
+ */
+ crfree(p->p_ucred);
+ p->p_ucred = NULL;
+ pargs_drop(p->p_args);
+ p->p_args = NULL;
+ sigacts_free(p->p_sigacts);
+ p->p_sigacts = NULL;
+
+ /*
+ * Do any thread-system specific cleanups.
+ */
+ thread_wait(p);
+
+ /*
+ * Give vm and machine-dependent layer a chance to free anything that
+ * cpu_exit couldn't release while still running in process context.
+ */
+ vm_waitproc(p);
+#ifdef MAC
+ mac_proc_destroy(p);
+#endif
+ KASSERT(FIRST_THREAD_IN_PROC(p),
+ ("proc_reap: no residual thread!"));
+ uma_zfree(proc_zone, p);
+ sx_xlock(&allproc_lock);
+ nprocs--;
+ sx_xunlock(&allproc_lock);
+}
+
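+/*
+ * Helper for kern_wait6(): check whether process 'p' matches the (idtype, id)
+ * pair and the wait options. Returns 0 if 'p' does not match or may not be
+ * waited for, 1 if 'p' matches but is not yet reapable (the process lock is
+ * dropped before returning), and -1 if 'p' was a zombie and has been reaped
+ * via proc_reap(), in which case the wait is complete and the proctree_lock
+ * has been dropped as well.
+ */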
+static int
+proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
+ int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+{
+ struct proc *q;
+ struct rusage *rup;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+
+ q = td->td_proc;
+ PROC_LOCK(p);
+
+ switch (idtype) {
+ case P_ALL:
+ break;
+ case P_PID:
+ if (p->p_pid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_PGID:
+ if (p->p_pgid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_SID:
+ if (p->p_session->s_sid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_UID:
+ if (p->p_ucred->cr_uid != (uid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_GID:
+ if (p->p_ucred->cr_gid != (gid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_JAILID:
+ if (p->p_ucred->cr_prison->pr_id != (int)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ /*
+ * It seems that the thread structures get zeroed out
+ * at process exit. This makes it impossible to
+ * support P_SETID, P_CID or P_CPUID.
+ */
+ default:
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (p_canwait(td, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ /*
+ * This special case handles a kthread spawned by linux_clone
+ * (see linux_misc.c). The linux_wait4 and linux_waitpid
+ * functions need to be able to distinguish between waiting
+ * on a process and waiting on a thread. It is a thread if
+ * p_sigparent is not SIGCHLD, and the WLINUXCLONE option
+ * signifies we want to wait for threads and not processes.
+ */
+ if ((p->p_sigparent != SIGCHLD) ^
+ ((options & WLINUXCLONE) != 0)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ PROC_SLOCK(p);
+
+ if (siginfo != NULL) {
+ bzero(siginfo, sizeof(*siginfo));
+ siginfo->si_errno = 0;
+
+ /*
+ * SUSv4 requires that the si_signo value is always
+ * SIGCHLD. Obey it even though the rfork(2) interface
+ * allows a different signal to be requested for child
+ * exit notification.
+ */
+ siginfo->si_signo = SIGCHLD;
+
+ /*
+ * This is still a rough estimate. We will fix the
+ * cases TRAPPED, STOPPED, and CONTINUED later.
+ */
+ if (WCOREDUMP(p->p_xstat))
+ siginfo->si_code = CLD_DUMPED;
+ else if (WIFSIGNALED(p->p_xstat))
+ siginfo->si_code = CLD_KILLED;
+ else
+ siginfo->si_code = CLD_EXITED;
+
+ siginfo->si_pid = p->p_pid;
+ siginfo->si_uid = p->p_ucred->cr_uid;
+ siginfo->si_status = p->p_xstat;
+
+ /*
+ * The si_addr field would be useful additional
+ * detail, but apparently the PC value may be lost
+ * when we reach this point. bzero() above sets
+ * siginfo->si_addr to NULL.
+ */
+ }
+
+ /*
+ * There should be no reason to limit resource usage info to
+ * exited processes only. A snapshot of the resources used
+ * by a stopped process may be exactly what is needed.
+ */
+ if (wrusage != NULL) {
+ rup = &wrusage->wru_self;
+ *rup = p->p_ru;
+ calcru(p, &rup->ru_utime, &rup->ru_stime);
+
+ rup = &wrusage->wru_children;
+ *rup = p->p_stats->p_cru;
+ calccru(p, &rup->ru_utime, &rup->ru_stime);
+ }
+
+ if (p->p_state == PRS_ZOMBIE) {
+ proc_reap(td, p, status, options);
+ return (-1);
+ }
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ return (1);
+}
+
+int
+kern_wait(struct thread *td, pid_t pid, int *status, int options,
+ struct rusage *rusage)
+{
+ struct __wrusage wru, *wrup;
+ idtype_t idtype;
+ id_t id;
+ int ret;
+
+ /*
+ * Translate the special pid values into the (idtype, pid)
+ * pair for kern_wait6. The WAIT_MYPGRP case is handled by
+ * kern_wait6() on its own.
+ */
+ if (pid == WAIT_ANY) {
+ idtype = P_ALL;
+ id = 0;
+ } else if (pid < 0) {
+ idtype = P_PGID;
+ id = (id_t)-pid;
+ } else {
+ idtype = P_PID;
+ id = (id_t)pid;
+ }
+
+ if (rusage != NULL)
+ wrup = &wru;
+ else
+ wrup = NULL;
+
+ /*
+ * For backward compatibility we implicitly add flags WEXITED
+ * and WTRAPPED here.
+ */
+ options |= WEXITED | WTRAPPED;
+ ret = kern_wait6(td, idtype, id, status, options, wrup, NULL);
+ if (rusage != NULL)
+ *rusage = wru.wru_self;
+ return (ret);
+}
+
+int
+kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status,
+ int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+{
+ struct proc *p, *q;
+ int error, nfound, ret;
+
+ AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */
+ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */
+ AUDIT_ARG_VALUE(options);
+
+ q = td->td_proc;
+
+ if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
+ PROC_LOCK(q);
+ id = (id_t)q->p_pgid;
+ PROC_UNLOCK(q);
+ idtype = P_PGID;
+ }
+
+ /* If we don't know the option, return EINVAL. */
+ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT |
+ WEXITED | WTRAPPED | WLINUXCLONE)) != 0)
+ return (EINVAL);
+ if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) {
+ /*
+ * We will be unable to find any matching processes,
+ * because there are no known events to look for.
+ * Prefer to return error instead of blocking
+ * indefinitely.
+ */
+ return (EINVAL);
+ }
+
+loop:
+ if (q->p_flag & P_STATCHILD) {
+ PROC_LOCK(q);
+ q->p_flag &= ~P_STATCHILD;
+ PROC_UNLOCK(q);
+ }
+ nfound = 0;
+ sx_xlock(&proctree_lock);
+ LIST_FOREACH(p, &q->p_children, p_sibling) {
+ ret = proc_to_reap(td, p, idtype, id, status, options,
+ wrusage, siginfo);
+ if (ret == 0)
+ continue;
+ else if (ret == 1)
+ nfound++;
+ else
+ return (0);
+
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+
+ if ((options & WTRAPPED) != 0 &&
+ (p->p_flag & P_TRACED) != 0 &&
+ (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 &&
+ (p->p_suspcount == p->p_numthreads) &&
+ ((p->p_flag & P_WAITED) == 0)) {
+ PROC_SUNLOCK(p);
+ if ((options & WNOWAIT) == 0)
+ p->p_flag |= P_WAITED;
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+
+ if (status != NULL)
+ *status = W_STOPCODE(p->p_xstat);
+ if (siginfo != NULL) {
+ siginfo->si_status = p->p_xstat;
+ siginfo->si_code = CLD_TRAPPED;
+ }
+ if ((options & WNOWAIT) == 0) {
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ if ((options & WUNTRACED) != 0 &&
+ (p->p_flag & P_STOPPED_SIG) != 0 &&
+ (p->p_suspcount == p->p_numthreads) &&
+ ((p->p_flag & P_WAITED) == 0)) {
+ PROC_SUNLOCK(p);
+ if ((options & WNOWAIT) == 0)
+ p->p_flag |= P_WAITED;
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+
+ if (status != NULL)
+ *status = W_STOPCODE(p->p_xstat);
+ if (siginfo != NULL) {
+ siginfo->si_status = p->p_xstat;
+ siginfo->si_code = CLD_STOPPED;
+ }
+ if ((options & WNOWAIT) == 0) {
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_SUNLOCK(p);
+ if ((options & WCONTINUED) != 0 &&
+ (p->p_flag & P_CONTINUED) != 0) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+ if ((options & WNOWAIT) == 0) {
+ p->p_flag &= ~P_CONTINUED;
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+ PROC_UNLOCK(p);
+
+ if (status != NULL)
+ *status = SIGCONT;
+ if (siginfo != NULL) {
+ siginfo->si_status = SIGCONT;
+ siginfo->si_code = CLD_CONTINUED;
+ }
+ return (0);
+ }
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * Look in the orphans list too, to allow the parent to
+ * collect its child's exit status even if the child is being
+ * debugged.
+ *
+ * A debugger detaches from the parent upon a successful
+ * switch-over from parent to child. At that point, due to
+ * the re-parenting, the parent loses the child to the debugger
+ * and a wait4(2) call would report that it has no children to
+ * wait for. By maintaining a list of orphans we allow the
+ * parent to successfully wait until the child becomes a zombie.
+ */
+ LIST_FOREACH(p, &q->p_orphans, p_orphan) {
+ ret = proc_to_reap(td, p, idtype, id, status, options,
+ wrusage, siginfo);
+ if (ret == 0)
+ continue;
+ else if (ret == 1)
+ nfound++;
+ else
+ return (0);
+ }
+ if (nfound == 0) {
+ sx_xunlock(&proctree_lock);
+ return (ECHILD);
+ }
+ if (options & WNOHANG) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = 0;
+ return (0);
+ }
+ PROC_LOCK(q);
+ sx_xunlock(&proctree_lock);
+ if (q->p_flag & P_STATCHILD) {
+ q->p_flag &= ~P_STATCHILD;
+ error = 0;
+ } else
+ error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
+ PROC_UNLOCK(q);
+ if (error)
+ return (error);
+ goto loop;
+}
+
+/*
+ * Make process 'parent' the new parent of process 'child'.
+ * Must be called with an exclusive hold of proctree lock.
+ */
+void
+proc_reparent(struct proc *child, struct proc *parent)
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(child, MA_OWNED);
+ if (child->p_pptr == parent)
+ return;
+
+ PROC_LOCK(child->p_pptr);
+ sigqueue_take(child->p_ksi);
+ PROC_UNLOCK(child->p_pptr);
+ LIST_REMOVE(child, p_sibling);
+ LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
+
+ clear_orphan(child);
+ if (child->p_flag & P_TRACED) {
+ LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan);
+ child->p_flag |= P_ORPHAN;
+ }
+
+ child->p_pptr = parent;
+}
diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c
new file mode 100644
index 0000000..85d81e8
--- /dev/null
+++ b/sys/kern/kern_fail.c
@@ -0,0 +1,611 @@
+/*-
+ * Copyright (c) 2009 Isilon Inc http://www.isilon.com/
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/**
+ * @file
+ *
+ * fail(9) Facility.
+ *
+ * @ingroup failpoint_private
+ */
+/**
+ * @defgroup failpoint fail(9) Facility
+ *
+ * Failpoints allow for injecting fake errors into running code on the fly,
+ * without modifying code or recompiling with flags. Failpoints are always
+ * present, and are very efficient when disabled. Failpoints are described
+ * in man fail(9).
+ */
+/**
+ * @defgroup failpoint_private Private fail(9) Implementation functions
+ *
+ * Private implementations for the actual failpoint code.
+ *
+ * @ingroup failpoint
+ */
+/**
+ * @addtogroup failpoint_private
+ * @{
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ctype.h>
+#include <sys/errno.h>
+#include <sys/fail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
+
+#ifdef ILOG_DEFINE_FOR_FILE
+ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point);
+#endif
+
+static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system");
+#define fp_free(ptr) free(ptr, M_FAIL_POINT)
+#define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags))
+
+static struct mtx g_fp_mtx;
+MTX_SYSINIT(g_fp_mtx, &g_fp_mtx, "fail point mtx", MTX_DEF);
+#define FP_LOCK() mtx_lock(&g_fp_mtx)
+#define FP_UNLOCK() mtx_unlock(&g_fp_mtx)
+
+/**
+ * Failpoint types.
+ * Don't change these without changing fail_type_strings in fail.c.
+ * @ingroup failpoint_private
+ */
+enum fail_point_t {
+ FAIL_POINT_OFF, /**< don't fail */
+ FAIL_POINT_PANIC, /**< panic */
+ FAIL_POINT_RETURN, /**< return an errorcode */
+ FAIL_POINT_BREAK, /**< break into the debugger */
+ FAIL_POINT_PRINT, /**< print a message */
+ FAIL_POINT_SLEEP, /**< sleep for some msecs */
+ FAIL_POINT_NUMTYPES
+};
+
+static struct {
+ const char *name;
+ int nmlen;
+} fail_type_strings[] = {
+#define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 }
+ [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"),
+ [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"),
+ [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"),
+ [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"),
+ [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"),
+ [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"),
+};
+
+/**
+ * Internal structure tracking a single term of a complete failpoint.
+ * @ingroup failpoint_private
+ */
+struct fail_point_entry {
+ enum fail_point_t fe_type; /**< type of entry */
+ int fe_arg; /**< argument to type (e.g. return value) */
+ int fe_prob; /**< likelihood of firing in millionths */
+ int fe_count; /**< number of times to fire, 0 means always */
+ pid_t fe_pid; /**< only fail for this process */
+ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */
+};
+
+static inline void
+fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent,
+ int msecs, enum fail_point_return_code *pret)
+{
+ /* convert from millisecs to ticks, rounding up */
+ int timo = ((msecs * hz) + 999) / 1000;
+
+ if (timo > 0) {
+ if (fp->fp_sleep_fn == NULL) {
+ msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo);
+ } else {
+ timeout(fp->fp_sleep_fn, fp->fp_sleep_arg, timo);
+ *pret = FAIL_POINT_RC_QUEUED;
+ }
+ }
+}
+
+
+/**
+ * Defines expressing the equivalent of probability one (100%).
+ */
+enum {
+ PROB_MAX = 1000000, /* probability between zero and this number */
+ PROB_DIGITS = 6, /* number of zeros in the above number */
+};
+
+static char *parse_fail_point(struct fail_point_entries *, char *);
+static char *parse_term(struct fail_point_entries *, char *);
+static char *parse_number(int *out_units, int *out_decimal, char *);
+static char *parse_type(struct fail_point_entry *, char *);
+static void free_entry(struct fail_point_entries *, struct fail_point_entry *);
+static void clear_entries(struct fail_point_entries *);
+
+/**
+ * Initialize a fail_point. The name is formed in a printf-like fashion
+ * from "fmt" and subsequent arguments. This function is generally used
+ * for custom failpoints located at odd places in the sysctl tree, and is
+ * not explicitly needed for standard in-line-declared failpoints.
+ *
+ * @ingroup failpoint
+ */
+void
+fail_point_init(struct fail_point *fp, const char *fmt, ...)
+{
+ va_list ap;
+ char *name;
+ int n;
+
+ TAILQ_INIT(&fp->fp_entries);
+ fp->fp_flags = 0;
+
+ /* Figure out the size of the name. */
+ va_start(ap, fmt);
+ n = vsnprintf(NULL, 0, fmt, ap);
+ va_end(ap);
+
+ /* Allocate the name and fill it in. */
+ name = fp_malloc(n + 1, M_WAITOK);
+ if (name != NULL) {
+ va_start(ap, fmt);
+ vsnprintf(name, n + 1, fmt, ap);
+ va_end(ap);
+ }
+ fp->fp_name = name;
+ fp->fp_location = "";
+ fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME;
+ fp->fp_sleep_fn = NULL;
+ fp->fp_sleep_arg = NULL;
+}
+
+/**
+ * Free the resources held by a fail_point.
+ *
+ * @ingroup failpoint
+ */
+void
+fail_point_destroy(struct fail_point *fp)
+{
+
+ if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) {
+ fp_free(__DECONST(void *, fp->fp_name));
+ fp->fp_name = NULL;
+ }
+ fp->fp_flags = 0;
+ clear_entries(&fp->fp_entries);
+}
+
+/**
+ * This does the real work of evaluating a fail point. If the fail point tells
+ * us to return a value, this function returns 1 and fills in 'return_value'
+ * (return_value is allowed to be null). If the fail point tells us to panic,
+ * we never return. Otherwise we just return 0 after doing some work, which
+ * means "keep going".
+ */
+enum fail_point_return_code
+fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
+{
+ enum fail_point_return_code ret = FAIL_POINT_RC_CONTINUE;
+ struct fail_point_entry *ent, *next;
+ int msecs;
+
+ FP_LOCK();
+
+ TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) {
+ int cont = 0; /* don't continue by default */
+
+ if (ent->fe_prob < PROB_MAX &&
+ ent->fe_prob < random() % PROB_MAX)
+ continue;
+ if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid)
+ continue;
+
+ switch (ent->fe_type) {
+ case FAIL_POINT_PANIC:
+ panic("fail point %s panicking", fp->fp_name);
+ /* NOTREACHED */
+
+ case FAIL_POINT_RETURN:
+ if (return_value != NULL)
+ *return_value = ent->fe_arg;
+ ret = FAIL_POINT_RC_RETURN;
+ break;
+
+ case FAIL_POINT_BREAK:
+ printf("fail point %s breaking to debugger\n",
+ fp->fp_name);
+ breakpoint();
+ break;
+
+ case FAIL_POINT_PRINT:
+ printf("fail point %s executing\n", fp->fp_name);
+ cont = ent->fe_arg;
+ break;
+
+ case FAIL_POINT_SLEEP:
+ /*
+ * Free the entry now if necessary, since
+ * we're about to drop the mutex and sleep.
+ */
+ msecs = ent->fe_arg;
+ if (ent->fe_count > 0 && --ent->fe_count == 0) {
+ free_entry(&fp->fp_entries, ent);
+ ent = NULL;
+ }
+
+ if (msecs)
+ fail_point_sleep(fp, ent, msecs, &ret);
+ break;
+
+ default:
+ break;
+ }
+
+ if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0)
+ free_entry(&fp->fp_entries, ent);
+ if (cont == 0)
+ break;
+ }
+
+ /* Get rid of "off"s at the end. */
+ while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
+ ent->fe_type == FAIL_POINT_OFF)
+ free_entry(&fp->fp_entries, ent);
+
+ FP_UNLOCK();
+
+ return (ret);
+}
+
+/**
+ * Translate internal fail_point structure into human-readable text.
+ */
+static void
+fail_point_get(struct fail_point *fp, struct sbuf *sb)
+{
+ struct fail_point_entry *ent;
+
+ FP_LOCK();
+
+ TAILQ_FOREACH(ent, &fp->fp_entries, fe_entries) {
+ if (ent->fe_prob < PROB_MAX) {
+ int decimal = ent->fe_prob % (PROB_MAX / 100);
+ int units = ent->fe_prob / (PROB_MAX / 100);
+ sbuf_printf(sb, "%d", units);
+ if (decimal) {
+ int digits = PROB_DIGITS - 2;
+ while (!(decimal % 10)) {
+ digits--;
+ decimal /= 10;
+ }
+ sbuf_printf(sb, ".%0*d", digits, decimal);
+ }
+ sbuf_printf(sb, "%%");
+ }
+ if (ent->fe_count > 0)
+ sbuf_printf(sb, "%d*", ent->fe_count);
+ sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name);
+ if (ent->fe_arg)
+ sbuf_printf(sb, "(%d)", ent->fe_arg);
+ if (ent->fe_pid != NO_PID)
+ sbuf_printf(sb, "[pid %d]", ent->fe_pid);
+ if (TAILQ_NEXT(ent, fe_entries))
+ sbuf_printf(sb, "->");
+ }
+ if (TAILQ_EMPTY(&fp->fp_entries))
+ sbuf_printf(sb, "off");
+
+ FP_UNLOCK();
+}
+
+/**
+ * Set an internal fail_point structure from a human-readable failpoint string
+ * in a lock-safe manner.
+ */
+static int
+fail_point_set(struct fail_point *fp, char *buf)
+{
+ int error = 0;
+ struct fail_point_entry *ent, *ent_next;
+ struct fail_point_entries new_entries;
+
+ /* Parse new entries. */
+ TAILQ_INIT(&new_entries);
+ if (!parse_fail_point(&new_entries, buf)) {
+ clear_entries(&new_entries);
+ error = EINVAL;
+ goto end;
+ }
+
+ FP_LOCK();
+
+ /* Move new entries in. */
+ TAILQ_SWAP(&fp->fp_entries, &new_entries, fail_point_entry, fe_entries);
+ clear_entries(&new_entries);
+
+ /* Get rid of useless zero probability entries. */
+ TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, ent_next) {
+ if (ent->fe_prob == 0)
+ free_entry(&fp->fp_entries, ent);
+ }
+
+ /* Get rid of "off"s at the end. */
+ while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
+ ent->fe_type == FAIL_POINT_OFF)
+ free_entry(&fp->fp_entries, ent);
+
+ FP_UNLOCK();
+
+ end:
+#ifdef IWARNING
+ if (error)
+ IWARNING("Failed to set %s %s to %s",
+ fp->fp_name, fp->fp_location, buf);
+ else
+ INOTICE("Set %s %s to %s",
+ fp->fp_name, fp->fp_location, buf);
+#endif /* IWARNING */
+
+ return (error);
+}
+
+#define MAX_FAIL_POINT_BUF 1023
+
+/**
+ * Handle kernel failpoint set/get.
+ */
+int
+fail_point_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct fail_point *fp = arg1;
+ char *buf = NULL;
+ struct sbuf sb;
+ int error;
+
+ /* Retrieving */
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+ fail_point_get(fp, &sb);
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
+ sbuf_delete(&sb);
+
+ /* Setting */
+ if (!error && req->newptr) {
+ if (req->newlen > MAX_FAIL_POINT_BUF) {
+ error = EINVAL;
+ goto out;
+ }
+
+ buf = fp_malloc(req->newlen + 1, M_WAITOK);
+
+ error = SYSCTL_IN(req, buf, req->newlen);
+ if (error)
+ goto out;
+ buf[req->newlen] = '\0';
+
+ error = fail_point_set(fp, buf);
+ }
+
+out:
+ fp_free(buf);
+ return (error);
+}
+
+/**
+ * Internal helper function to translate a human-readable failpoint string
+ * into an internally-parsable fail_point structure.
+ */
+static char *
+parse_fail_point(struct fail_point_entries *ents, char *p)
+{
+ /* <fail_point> ::
+ * <term> ( "->" <term> )*
+ */
+ p = parse_term(ents, p);
+ if (p == NULL)
+ return (NULL);
+ while (*p != '\0') {
+ if (p[0] != '-' || p[1] != '>')
+ return (NULL);
+ p = parse_term(ents, p + 2);
+ if (p == NULL)
+ return (NULL);
+ }
+ return (p);
+}
+
+/**
+ * Internal helper function to parse an individual term from a failpoint.
+ */
+static char *
+parse_term(struct fail_point_entries *ents, char *p)
+{
+ struct fail_point_entry *ent;
+
+ ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO);
+ ent->fe_prob = PROB_MAX;
+ ent->fe_pid = NO_PID;
+ TAILQ_INSERT_TAIL(ents, ent, fe_entries);
+
+ /*
+ * <term> ::
+ * ( (<float> "%") | (<integer> "*" ) )*
+ * <type>
+ * [ "(" <integer> ")" ]
+ * [ "[pid " <integer> "]" ]
+ */
+
+ /* ( (<float> "%") | (<integer> "*" ) )* */
+ while (isdigit(*p) || *p == '.') {
+ int units, decimal;
+
+ p = parse_number(&units, &decimal, p);
+ if (p == NULL)
+ return (NULL);
+
+ if (*p == '%') {
+ if (units > 100) /* prevent overflow early */
+ units = 100;
+ ent->fe_prob = units * (PROB_MAX / 100) + decimal;
+ if (ent->fe_prob > PROB_MAX)
+ ent->fe_prob = PROB_MAX;
+ } else if (*p == '*') {
+ if (!units || decimal)
+ return (NULL);
+ ent->fe_count = units;
+ } else
+ return (NULL);
+ p++;
+ }
+
+ /* <type> */
+ p = parse_type(ent, p);
+ if (p == NULL)
+ return (NULL);
+ if (*p == '\0')
+ return (p);
+
+ /* [ "(" <integer> ")" ] */
+ if (*p != '(')
+ return (p);
+ p++;
+ if (!isdigit(*p) && *p != '-')
+ return (NULL);
+ ent->fe_arg = strtol(p, &p, 0);
+ if (*p++ != ')')
+ return (NULL);
+
+ /* [ "[pid " <integer> "]" ] */
+#define PID_STRING "[pid "
+ if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0)
+ return (p);
+ p += sizeof(PID_STRING) - 1;
+ if (!isdigit(*p))
+ return (NULL);
+ ent->fe_pid = strtol(p, &p, 0);
+ if (*p++ != ']')
+ return (NULL);
+
+ return (p);
+}
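+
+/*
+ * For illustration (a non-normative sketch derived from the grammar above;
+ * see fail(9) for the authoritative description), strings such as the
+ * following are accepted, typically written to a debug.fail_point.* sysctl:
+ *
+ *	"off"				never fail
+ *	"return(5)"			always fail, returning error 5
+ *	"1.5%return(35)->sleep(100)"	return 35 with 1.5% probability,
+ *					otherwise sleep for 100ms
+ *	"2*return(5)[pid 42]"		fail twice, only for pid 42
+ */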
+
+/**
+ * Internal helper function to parse a numeric for a failpoint term.
+ */
+static char *
+parse_number(int *out_units, int *out_decimal, char *p)
+{
+ char *old_p;
+
+ /*
+ * <number> ::
+ * <integer> [ "." <integer> ] |
+ * "." <integer>
+ */
+
+ /* whole part */
+ old_p = p;
+ *out_units = strtol(p, &p, 10);
+ if (p == old_p && *p != '.')
+ return (NULL);
+
+ /* fractional part */
+ *out_decimal = 0;
+ if (*p == '.') {
+ int digits = 0;
+ p++;
+ while (isdigit(*p)) {
+ int digit = *p - '0';
+ if (digits < PROB_DIGITS - 2)
+ *out_decimal = *out_decimal * 10 + digit;
+ else if (digits == PROB_DIGITS - 2 && digit >= 5)
+ (*out_decimal)++;
+ digits++;
+ p++;
+ }
+ if (!digits) /* need at least one digit after '.' */
+ return (NULL);
+ while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */
+ *out_decimal *= 10;
+ }
+
+ return (p); /* success */
+}
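+
+/*
+ * Worked example (derived from the code above): parsing "2.5" yields
+ * *out_units == 2 and *out_decimal == 5000; parse_term() then computes
+ * fe_prob = 2 * (PROB_MAX / 100) + 5000 = 25000, i.e. 2.5% of PROB_MAX.
+ */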
+
+/**
+ * Internal helper function to parse an individual type for a failpoint term.
+ */
+static char *
+parse_type(struct fail_point_entry *ent, char *beg)
+{
+ enum fail_point_t type;
+ int len;
+
+ for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) {
+ len = fail_type_strings[type].nmlen;
+ if (strncmp(fail_type_strings[type].name, beg, len) == 0) {
+ ent->fe_type = type;
+ return (beg + len);
+ }
+ }
+ return (NULL);
+}
+
+/**
+ * Internal helper function to free an individual failpoint term.
+ */
+static void
+free_entry(struct fail_point_entries *ents, struct fail_point_entry *ent)
+{
+ TAILQ_REMOVE(ents, ent, fe_entries);
+ fp_free(ent);
+}
+
+/**
+ * Internal helper function to clear out all failpoint terms for a single
+ * failpoint.
+ */
+static void
+clear_entries(struct fail_point_entries *ents)
+{
+ struct fail_point_entry *ent, *ent_next;
+
+ TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next)
+ fp_free(ent);
+ TAILQ_INIT(ents);
+}
+
+/* The fail point sysctl tree. */
+SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points");
diff --git a/sys/kern/kern_ffclock.c b/sys/kern/kern_ffclock.c
new file mode 100644
index 0000000..07441cd
--- /dev/null
+++ b/sys/kern/kern_ffclock.c
@@ -0,0 +1,479 @@
+/*-
+ * Copyright (c) 2011 The University of Melbourne
+ * All rights reserved.
+ *
+ * This software was developed by Julien Ridoux at the University of Melbourne
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/timeffc.h>
+
+#ifdef FFCLOCK
+
+FEATURE(ffclock, "Feed-forward clock support");
+
+extern struct ffclock_estimate ffclock_estimate;
+extern struct bintime ffclock_boottime;
+extern int8_t ffclock_updated;
+extern struct mtx ffclock_mtx;
+
+/*
+ * Feed-forward clock absolute time. This should be the preferred way to read
+ * the feed-forward clock for "wall-clock" type time. The flags allow composing
+ * various flavours of absolute time (e.g. with or without leap seconds taken
+ * into account). If valid pointers are provided, the ffcounter value and an
+ * upper bound on the clock error associated with the bintime are returned.
+ * NOTE: use ffclock_convert_abs() to defer the conversion of an ffcounter
+ * value read earlier.
+ */
+void
+ffclock_abstime(ffcounter *ffcount, struct bintime *bt,
+ struct bintime *error_bound, uint32_t flags)
+{
+ struct ffclock_estimate cest;
+ ffcounter ffc;
+ ffcounter update_ffcount;
+ ffcounter ffdelta_error;
+
+ /* Get counter and corresponding time. */
+ if ((flags & FFCLOCK_FAST) == FFCLOCK_FAST)
+ ffclock_last_tick(&ffc, bt, flags);
+ else {
+ ffclock_read_counter(&ffc);
+ ffclock_convert_abs(ffc, bt, flags);
+ }
+
+ /* Current ffclock estimate, use update_ffcount as generation number. */
+ do {
+ update_ffcount = ffclock_estimate.update_ffcount;
+ bcopy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate));
+ } while (update_ffcount != ffclock_estimate.update_ffcount);
+
+ /*
+ * Leap second adjustment. Total as seen by synchronisation algorithm
+ * since it started. cest.leapsec_next is the ffcounter prediction of
+ * when the next leapsecond occurs.
+ */
+ if ((flags & FFCLOCK_LEAPSEC) == FFCLOCK_LEAPSEC) {
+ bt->sec -= cest.leapsec_total;
+ if (ffc > cest.leapsec_next)
+ bt->sec -= cest.leapsec;
+ }
+
+ /* Boot time adjustment, for uptime/monotonic clocks. */
+ if ((flags & FFCLOCK_UPTIME) == FFCLOCK_UPTIME) {
+ bintime_sub(bt, &ffclock_boottime);
+ }
+
+ /* Compute error bound if a valid pointer has been passed. */
+ if (error_bound) {
+ ffdelta_error = ffc - cest.update_ffcount;
+ ffclock_convert_diff(ffdelta_error, error_bound);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */
+ bintime_mul(error_bound, cest.errb_rate *
+ (uint64_t)18446744073709LL);
+ /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns] */
+ bintime_addx(error_bound, cest.errb_abs *
+ (uint64_t)18446744073LL);
+ }
+
+ if (ffcount)
+ *ffcount = ffc;
+}
+
+/*
+ * Feed-forward difference clock. This should be the preferred way to convert a
+ * time interval in ffcounter values into a time interval in seconds. If a valid
+ * pointer is passed, an upper bound on the error in computing the time interval
+ * in seconds is provided.
+ */
+void
+ffclock_difftime(ffcounter ffdelta, struct bintime *bt,
+ struct bintime *error_bound)
+{
+ ffcounter update_ffcount;
+ uint32_t err_rate;
+
+ ffclock_convert_diff(ffdelta, bt);
+
+ if (error_bound) {
+ do {
+ update_ffcount = ffclock_estimate.update_ffcount;
+ err_rate = ffclock_estimate.errb_rate;
+ } while (update_ffcount != ffclock_estimate.update_ffcount);
+
+ ffclock_convert_diff(ffdelta, error_bound);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */
+ bintime_mul(error_bound, err_rate * (uint64_t)18446744073709LL);
+ }
+}
+
+/*
+ * Create a new kern.sysclock sysctl node, which will be home to some generic
+ * sysclock configuration variables. Feed-forward clock specific variables will
+ * live under the ffclock subnode.
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, sysclock, CTLFLAG_RW, 0,
+ "System clock related configuration");
+SYSCTL_NODE(_kern_sysclock, OID_AUTO, ffclock, CTLFLAG_RW, 0,
+ "Feed-forward clock configuration");
+
+static char *sysclocks[] = {"feedback", "feed-forward"};
+#define MAX_SYSCLOCK_NAME_LEN 16
+#define NUM_SYSCLOCKS (sizeof(sysclocks) / sizeof(*sysclocks))
+
+static int ffclock_version = 2;
+SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, version, CTLFLAG_RD,
+ &ffclock_version, 0, "Feed-forward clock kernel version");
+
+/* List available sysclocks. */
+static int
+sysctl_kern_sysclock_available(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *s;
+ int clk, error;
+
+ s = sbuf_new_for_sysctl(NULL, NULL,
+ MAX_SYSCLOCK_NAME_LEN * NUM_SYSCLOCKS, req);
+ if (s == NULL)
+ return (ENOMEM);
+
+ for (clk = 0; clk < NUM_SYSCLOCKS; clk++) {
+ sbuf_cat(s, sysclocks[clk]);
+ if (clk + 1 < NUM_SYSCLOCKS)
+ sbuf_cat(s, " ");
+ }
+ error = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (error);
+}
+
+SYSCTL_PROC(_kern_sysclock, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, sysctl_kern_sysclock_available, "A",
+ "List of available system clocks");
+
+/*
+ * Return the name of the active system clock if read, or attempt to change
+ * the active system clock to the user specified one if written to. The active
+ * system clock is read when calling any of the [get]{bin,nano,micro}[up]time()
+ * functions.
+ */
+static int
+sysctl_kern_sysclock_active(SYSCTL_HANDLER_ARGS)
+{
+ char newclock[MAX_SYSCLOCK_NAME_LEN];
+ int clk, error;
+
+ if (req->newptr == NULL) {
+ /* Return the name of the current active sysclock. */
+ strlcpy(newclock, sysclocks[sysclock_active], sizeof(newclock));
+ error = sysctl_handle_string(oidp, newclock,
+ sizeof(newclock), req);
+ } else {
+ /* Change the active sysclock to the user specified one. */
+ error = EINVAL;
+ for (clk = 0; clk < NUM_SYSCLOCKS; clk++) {
+ if (strncmp((char *)req->newptr, sysclocks[clk],
+ strlen(sysclocks[clk])) == 0) {
+ sysclock_active = clk;
+ error = 0;
+ break;
+ }
+ }
+ }
+
+ return (error);
+}
+
+SYSCTL_PROC(_kern_sysclock, OID_AUTO, active, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_kern_sysclock_active, "A",
+ "Name of the active system clock which is currently serving time");
+
+static int sysctl_kern_ffclock_ffcounter_bypass = 0;
+SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, ffcounter_bypass, CTLFLAG_RW,
+ &sysctl_kern_ffclock_ffcounter_bypass, 0,
+ "Use reliable hardware timecounter as the feed-forward counter");
+
+/*
+ * High level functions to access the Feed-Forward Clock.
+ */
+void
+ffclock_bintime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+}
+
+void
+ffclock_nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_getbintime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+}
+
+void
+ffclock_getnanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_getmicrotime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_binuptime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+}
+
+void
+ffclock_nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_getbinuptime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+}
+
+void
+ffclock_getnanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_getmicrouptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt)
+{
+
+ ffclock_difftime(ffdelta, bt, NULL);
+}
+
+void
+ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_difftime(ffdelta, &bt, NULL);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_difftime(ffdelta, &bt, NULL);
+ bintime2timeval(&bt, tvp);
+}
+
+/*
+ * System call allowing userland applications to retrieve the current value of
+ * the Feed-Forward Clock counter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_getcounter_args {
+ ffcounter *ffcount;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap)
+{
+ ffcounter ffcount;
+ int error;
+
+ ffcount = 0;
+ ffclock_read_counter(&ffcount);
+ if (ffcount == 0)
+ return (EAGAIN);
+ error = copyout(&ffcount, uap->ffcount, sizeof(ffcounter));
+
+ return (error);
+}
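+
+/*
+ * Hypothetical userland sketch (assumes an ffclock_getcounter(2) libc
+ * wrapper for this syscall; names are illustrative only):
+ *
+ *	ffcounter ffc;
+ *	if (ffclock_getcounter(&ffc) == -1)
+ *		err(1, "ffclock_getcounter");
+ */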
+
+/*
+ * System call allowing the synchronisation daemon to push new feed-forward clock
+ * estimates to the kernel. Acquire ffclock_mtx to prevent concurrent updates
+ * and ensure data consistency.
+ * NOTE: ffclock_updated signals the fftimehands that new estimates are
+ * available. The updated estimates are picked up by the fftimehands on next
+ * tick, which could take as long as 1/hz seconds (if ticks are not missed).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_setestimate_args {
+ struct ffclock_estimate *cest;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap)
+{
+ struct ffclock_estimate cest;
+ int error;
+
+ /* Reuse of PRIV_CLOCK_SETTIME. */
+ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
+ return (error);
+
+ if ((error = copyin(uap->cest, &cest, sizeof(struct ffclock_estimate)))
+ != 0)
+ return (error);
+
+ mtx_lock(&ffclock_mtx);
+ memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate));
+ ffclock_updated++;
+ mtx_unlock(&ffclock_mtx);
+ return (error);
+}
+
+/*
+ * System call allowing userland applications to retrieve the clock estimates
+ * stored within the kernel. It is useful to kickstart the synchronisation
+ * daemon with the kernel's knowledge of the hardware timecounter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_getestimate_args {
+ struct ffclock_estimate *cest;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap)
+{
+ struct ffclock_estimate cest;
+ int error;
+
+ mtx_lock(&ffclock_mtx);
+ memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
+ mtx_unlock(&ffclock_mtx);
+ error = copyout(&cest, uap->cest, sizeof(struct ffclock_estimate));
+ return (error);
+}
+
+#else /* !FFCLOCK */
+
+int
+sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* FFCLOCK */
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
new file mode 100644
index 0000000..9cd1da9
--- /dev/null
+++ b/sys/kern/kern_fork.c
@@ -0,0 +1,1052 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_kstack_pages.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/syscall.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/signalvar.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_fork_func_t dtrace_fasttrap_fork;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE3(proc, kernel, , create, create, "struct proc *",
+ "struct proc *", "int");
+
+#ifndef _SYS_SYSPROTO_H_
+struct fork_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+sys_fork(struct thread *td, struct fork_args *uap)
+{
+ int error;
+ struct proc *p2;
+
+ error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_pdfork(struct thread *td, struct pdfork_args *uap)
+{
+#ifdef PROCDESC
+ int error, fd;
+ struct proc *p2;
+
+ /*
+ * It is necessary to return fd by reference because 0 is a valid file
+ * descriptor number, and the child needs to be able to distinguish
+ * itself from the parent using the return value.
+ */
+ error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
+ &fd, uap->flags);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ error = copyout(&fd, uap->fdp, sizeof(fd));
+ }
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+/* ARGSUSED */
+int
+sys_vfork(struct thread *td, struct vfork_args *uap)
+{
+ int error, flags;
+ struct proc *p2;
+
+ flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
+ error = fork1(td, flags, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+int
+sys_rfork(struct thread *td, struct rfork_args *uap)
+{
+ struct proc *p2;
+ int error;
+
+ /* Don't allow kernel-only flags. */
+ if ((uap->flags & RFKERNELONLY) != 0)
+ return (EINVAL);
+
+ AUDIT_ARG_FFLAGS(uap->flags);
+ error = fork1(td, uap->flags, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2 ? p2->p_pid : 0;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+int nprocs = 1; /* process 0 */
+int lastpid = 0;
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
+ "Last used PID");
+
+/*
+ * Random component to lastpid generation. We mix in a random factor to make
+ * it a little harder to predict. We sanity check the modulus value here so
+ * we don't have to do it in critical paths. Don't let it be too small or we
+ * pointlessly waste entropy, and don't let it be impossibly large. Using a
+ * modulus that is too big causes a LOT more process table scans and slows
+ * down fork processing as the pidchecked caching is defeated.
+ */
+static int randompid = 0;
+
+static int
+sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
+{
+ int error, pid;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error != 0)
+ return(error);
+ sx_xlock(&allproc_lock);
+ pid = randompid;
+ error = sysctl_handle_int(oidp, &pid, 0, req);
+ if (error == 0 && req->newptr != NULL) {
+ if (pid < 0 || pid > pid_max - 100) /* out of range */
+ pid = pid_max - 100;
+ else if (pid < 2) /* NOP */
+ pid = 0;
+ else if (pid < 100) /* Make it reasonable */
+ pid = 100;
+ randompid = pid;
+ }
+ sx_xunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
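+
+/*
+ * Illustrative usage (an assumption, not part of this change): setting the
+ * modulus from userland, e.g. "sysctl kern.randompid=1000", makes
+ * fork_findpid() below start its search at lastpid + 1 + (arc4random() % 1000)
+ * instead of lastpid + 1.
+ */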
+
+static int
+fork_findpid(int flags)
+{
+ struct proc *p;
+ int trypid;
+ static int pidchecked = 0;
+
+ /*
+ * Requires allproc_lock in order to iterate over the list
+ * of processes, and proctree_lock to access p_pgrp.
+ */
+ sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&proctree_lock, SX_LOCKED);
+
+ /*
+ * Find an unused process ID. We remember a range of unused IDs
+ * ready to use (from lastpid+1 through pidchecked-1).
+ *
+ * If RFHIGHPID is set (used during system boot), do not allocate
+ * low-numbered pids.
+ */
+ trypid = lastpid + 1;
+ if (flags & RFHIGHPID) {
+ if (trypid < 10)
+ trypid = 10;
+ } else {
+ if (randompid)
+ trypid += arc4random() % randompid;
+ }
+retry:
+ /*
+ * If the process ID prototype has wrapped around,
+ * restart somewhat above 0, as the low-numbered procs
+ * tend to include daemons that don't exit.
+ */
+ if (trypid >= pid_max) {
+ trypid = trypid % pid_max;
+ if (trypid < 100)
+ trypid += 100;
+ pidchecked = 0;
+ }
+ if (trypid >= pidchecked) {
+ int doingzomb = 0;
+
+ pidchecked = PID_MAX;
+ /*
+ * Scan the active and zombie procs to check whether this pid
+ * is in use. Remember the lowest pid that's greater
+ * than trypid, so we can avoid checking for a while.
+ */
+ p = LIST_FIRST(&allproc);
+again:
+ for (; p != NULL; p = LIST_NEXT(p, p_list)) {
+ while (p->p_pid == trypid ||
+ (p->p_pgrp != NULL &&
+ (p->p_pgrp->pg_id == trypid ||
+ (p->p_session != NULL &&
+ p->p_session->s_sid == trypid)))) {
+ trypid++;
+ if (trypid >= pidchecked)
+ goto retry;
+ }
+ if (p->p_pid > trypid && pidchecked > p->p_pid)
+ pidchecked = p->p_pid;
+ if (p->p_pgrp != NULL) {
+ if (p->p_pgrp->pg_id > trypid &&
+ pidchecked > p->p_pgrp->pg_id)
+ pidchecked = p->p_pgrp->pg_id;
+ if (p->p_session != NULL &&
+ p->p_session->s_sid > trypid &&
+ pidchecked > p->p_session->s_sid)
+ pidchecked = p->p_session->s_sid;
+ }
+ }
+ if (!doingzomb) {
+ doingzomb = 1;
+ p = LIST_FIRST(&zombproc);
+ goto again;
+ }
+ }
+
+ /*
+ * RFHIGHPID does not mess with the lastpid counter during boot.
+ */
+ if (flags & RFHIGHPID)
+ pidchecked = 0;
+ else
+ lastpid = trypid;
+
+ return (trypid);
+}
+
+static int
+fork_norfproc(struct thread *td, int flags)
+{
+ int error;
+ struct proc *p1;
+
+ KASSERT((flags & RFPROC) == 0,
+ ("fork_norfproc called with RFPROC set"));
+ p1 = td->td_proc;
+
+ if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
+ (flags & (RFCFDG | RFFDG))) {
+ PROC_LOCK(p1);
+ if (thread_single(SINGLE_BOUNDARY)) {
+ PROC_UNLOCK(p1);
+ return (ERESTART);
+ }
+ PROC_UNLOCK(p1);
+ }
+
+ error = vm_forkproc(td, NULL, NULL, NULL, flags);
+ if (error)
+ goto fail;
+
+ /*
+ * Close all file descriptors.
+ */
+ if (flags & RFCFDG) {
+ struct filedesc *fdtmp;
+ fdtmp = fdinit(td->td_proc->p_fd);
+ fdescfree(td);
+ p1->p_fd = fdtmp;
+ }
+
+ /*
+ * Unshare file descriptors (from parent).
+ */
+ if (flags & RFFDG)
+ fdunshare(p1, td);
+
+fail:
+ if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
+ (flags & (RFCFDG | RFFDG))) {
+ PROC_LOCK(p1);
+ thread_single_end();
+ PROC_UNLOCK(p1);
+ }
+ return (error);
+}
+
+static void
+do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
+ struct vmspace *vm2, int pdflags)
+{
+ struct proc *p1, *pptr;
+ int p2_held, trypid;
+ struct filedesc *fd;
+ struct filedesc_to_leader *fdtol;
+ struct sigacts *newsigacts;
+
+ sx_assert(&proctree_lock, SX_SLOCKED);
+ sx_assert(&allproc_lock, SX_XLOCKED);
+
+ p2_held = 0;
+ p1 = td->td_proc;
+
+ /*
+ * Increment the nprocs resource before blocking can occur. There
+ * are hard limits on the number of processes that can run.
+ */
+ nprocs++;
+
+ trypid = fork_findpid(flags);
+
+ sx_sunlock(&proctree_lock);
+
+ p2->p_state = PRS_NEW; /* protect against others */
+ p2->p_pid = trypid;
+ AUDIT_ARG_PID(p2->p_pid);
+ LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+ tidhash_add(td2);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ sx_xunlock(&allproc_lock);
+
+ bcopy(&p1->p_startcopy, &p2->p_startcopy,
+ __rangeof(struct proc, p_startcopy, p_endcopy));
+ pargs_hold(p2->p_args);
+ PROC_UNLOCK(p1);
+
+ bzero(&p2->p_startzero,
+ __rangeof(struct proc, p_startzero, p_endzero));
+
+ p2->p_ucred = crhold(td->td_ucred);
+
+ /* Tell the prison that we exist. */
+ prison_proc_hold(p2->p_ucred->cr_prison);
+
+ PROC_UNLOCK(p2);
+
+ /*
+ * Malloc things while we don't hold any locks.
+ */
+ if (flags & RFSIGSHARE)
+ newsigacts = NULL;
+ else
+ newsigacts = sigacts_alloc();
+
+ /*
+ * Copy filedesc.
+ */
+ if (flags & RFCFDG) {
+ fd = fdinit(p1->p_fd);
+ fdtol = NULL;
+ } else if (flags & RFFDG) {
+ fd = fdcopy(p1->p_fd);
+ fdtol = NULL;
+ } else {
+ fd = fdshare(p1->p_fd);
+ if (p1->p_fdtol == NULL)
+ p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
+ p1->p_leader);
+ if ((flags & RFTHREAD) != 0) {
+ /*
+ * Shared file descriptor table, and shared
+ * process leaders.
+ */
+ fdtol = p1->p_fdtol;
+ FILEDESC_XLOCK(p1->p_fd);
+ fdtol->fdl_refcount++;
+ FILEDESC_XUNLOCK(p1->p_fd);
+ } else {
+ /*
+ * Shared file descriptor table, and different
+ * process leaders.
+ */
+ fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
+ p1->p_fd, p2);
+ }
+ }
+ /*
+ * Make a proc table entry for the new process.
+ * Start by zeroing the section of proc that is zero-initialized,
+ * then copy the section that is copied directly from the parent.
+ */
+
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ bzero(&td2->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+
+ bcopy(&td->td_startcopy, &td2->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+
+ bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
+ td2->td_sigstk = td->td_sigstk;
+ td2->td_flags = TDF_INMEM;
+ td2->td_lend_user_pri = PRI_MAX;
+
+#ifdef VIMAGE
+ td2->td_vnet = NULL;
+ td2->td_vnet_lpush = NULL;
+#endif
+
+ /*
+ * Allow the scheduler to initialize the child.
+ */
+ thread_lock(td);
+ sched_fork(td, td2);
+ thread_unlock(td);
+
+ /*
+ * Duplicate sub-structures as needed.
+ * Increase reference counts on shared objects.
+ */
+ p2->p_flag = P_INMEM;
+ p2->p_swtick = ticks;
+ if (p1->p_flag & P_PROFIL)
+ startprofclock(p2);
+ td2->td_ucred = crhold(p2->p_ucred);
+
+ if (flags & RFSIGSHARE) {
+ p2->p_sigacts = sigacts_hold(p1->p_sigacts);
+ } else {
+ sigacts_copy(newsigacts, p1->p_sigacts);
+ p2->p_sigacts = newsigacts;
+ }
+
+ if (flags & RFTSIGZMB)
+ p2->p_sigparent = RFTSIGNUM(flags);
+ else if (flags & RFLINUXTHPN)
+ p2->p_sigparent = SIGUSR1;
+ else
+ p2->p_sigparent = SIGCHLD;
+
+ p2->p_textvp = p1->p_textvp;
+ p2->p_fd = fd;
+ p2->p_fdtol = fdtol;
+
+ /*
+ * p_limit is copy-on-write. Bump its refcount.
+ */
+ lim_fork(p1, p2);
+
+ pstats_fork(p1->p_stats, p2->p_stats);
+
+ PROC_UNLOCK(p1);
+ PROC_UNLOCK(p2);
+
+ /* Bump references to the text vnode (for procfs). */
+ if (p2->p_textvp)
+ vref(p2->p_textvp);
+
+ /*
+ * Set up linkage for kernel based threading.
+ */
+ if ((flags & RFTHREAD) != 0) {
+ mtx_lock(&ppeers_lock);
+ p2->p_peers = p1->p_peers;
+ p1->p_peers = p2;
+ p2->p_leader = p1->p_leader;
+ mtx_unlock(&ppeers_lock);
+ PROC_LOCK(p1->p_leader);
+ if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(p1->p_leader);
+ /*
+ * The task leader is exiting, so process p1 is
+ * going to be killed shortly. Since p1 obviously
+ * isn't dead yet, we know that the leader is either
+ * sending SIGKILL's to all the processes in this
+ * task or is sleeping waiting for all the peers to
+ * exit. We let p1 complete the fork, but we need
+ * to go ahead and kill the new process p2 since
+ * the task leader may not get a chance to send
+ * SIGKILL to it. We leave it on the list so that
+ * the task leader will wait for this new process
+ * to commit suicide.
+ */
+ PROC_LOCK(p2);
+ kern_psignal(p2, SIGKILL);
+ PROC_UNLOCK(p2);
+ } else
+ PROC_UNLOCK(p1->p_leader);
+ } else {
+ p2->p_peers = NULL;
+ p2->p_leader = p2;
+ }
+
+ sx_xlock(&proctree_lock);
+ PGRP_LOCK(p1->p_pgrp);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ /*
+ * Preserve some more flags in subprocess. P_PROFIL has already
+ * been preserved.
+ */
+ p2->p_flag |= p1->p_flag & P_SUGID;
+ td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
+ SESS_LOCK(p1->p_session);
+ if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
+ p2->p_flag |= P_CONTROLT;
+ SESS_UNLOCK(p1->p_session);
+ if (flags & RFPPWAIT)
+ p2->p_flag |= P_PPWAIT;
+
+ p2->p_pgrp = p1->p_pgrp;
+ LIST_INSERT_AFTER(p1, p2, p_pglist);
+ PGRP_UNLOCK(p1->p_pgrp);
+ LIST_INIT(&p2->p_children);
+ LIST_INIT(&p2->p_orphans);
+
+ callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);
+
+ /*
+ * If PF_FORK is set, the child process inherits the
+ * procfs ioctl flags from its parent.
+ */
+ if (p1->p_pfsflags & PF_FORK) {
+ p2->p_stops = p1->p_stops;
+ p2->p_pfsflags = p1->p_pfsflags;
+ }
+
+ /*
+ * This begins the section where we must prevent the parent
+ * from being swapped.
+ */
+ _PHOLD(p1);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Attach the new process to its parent.
+ *
+ * If RFNOWAIT is set, the newly created process becomes a child
+ * of init. This effectively disassociates the child from the
+ * parent.
+ */
+ if (flags & RFNOWAIT)
+ pptr = initproc;
+ else
+ pptr = p1;
+ p2->p_pptr = pptr;
+ LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+ sx_xunlock(&proctree_lock);
+
+ /* Inform accounting that we have forked. */
+ p2->p_acflag = AFORK;
+ PROC_UNLOCK(p2);
+
+#ifdef KTRACE
+ ktrprocfork(p1, p2);
+#endif
+
+ /*
+ * Finish creating the child process. It will return via a different
+ * execution path later (i.e., directly into user mode).
+ */
+ vm_forkproc(td, p2, td2, vm2, flags);
+
+ if (flags == (RFFDG | RFPROC)) {
+ PCPU_INC(cnt.v_forks);
+ PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
+ PCPU_INC(cnt.v_vforks);
+ PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else if (p1 == &proc0) {
+ PCPU_INC(cnt.v_kthreads);
+ PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else {
+ PCPU_INC(cnt.v_rforks);
+ PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ }
+
+#ifdef PROCDESC
+ /*
+ * Associate the process descriptor with the process before anything
+ * can happen that might cause that process to need the descriptor.
+ * However, don't do this until after fork(2) can no longer fail.
+ */
+ if (flags & RFPROCDESC)
+ procdesc_new(p2, pdflags);
+#endif
+
+ /*
+ * Both processes are set up, now check if any loadable modules want
+ * to adjust anything.
+ */
+ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
+
+ /*
+ * Set the child start time and mark the process as being complete.
+ */
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+ microuptime(&p2->p_stats->p_start);
+ PROC_SLOCK(p2);
+ p2->p_state = PRS_NORMAL;
+ PROC_SUNLOCK(p2);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the new process
+ * if it has registered an interest. We have to do this only after
+ * p_state is PRS_NORMAL since the fasttrap module will use pfind()
+ * later on.
+ */
+ if (dtrace_fasttrap_fork)
+ dtrace_fasttrap_fork(p1, p2);
+#endif
+ if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
+ P_FOLLOWFORK)) {
+ /*
+ * Arrange for debugger to receive the fork event.
+ *
+ * We can report PL_FLAG_FORKED regardless of
+ * P_FOLLOWFORK settings, but it does not make sense
+ * for a runaway child.
+ */
+ td->td_dbgflags |= TDB_FORK;
+ td->td_dbg_forked = p2->p_pid;
+ td2->td_dbgflags |= TDB_STOPATFORK;
+ _PHOLD(p2);
+ p2_held = 1;
+ }
+ if (flags & RFPPWAIT) {
+ td->td_pflags |= TDP_RFPPWAIT;
+ td->td_rfppwait_p = p2;
+ }
+ PROC_UNLOCK(p2);
+ if ((flags & RFSTOPPED) == 0) {
+ /*
+ * If RFSTOPPED not requested, make child runnable and
+ * add to run queue.
+ */
+ thread_lock(td2);
+ TD_SET_CAN_RUN(td2);
+ sched_add(td2, SRQ_BORING);
+ thread_unlock(td2);
+ }
+
+ /*
+ * Now can be swapped.
+ */
+ _PRELE(p1);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Tell any interested parties about the new process.
+ */
+ knote_fork(&p1->p_klist, p2->p_pid);
+ SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
+
+ /*
+ * Wait until debugger is attached to child.
+ */
+ PROC_LOCK(p2);
+ while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
+ cv_wait(&p2->p_dbgwait, &p2->p_mtx);
+ if (p2_held)
+ _PRELE(p2);
+ PROC_UNLOCK(p2);
+}
+
+int
+fork1(struct thread *td, int flags, int pages, struct proc **procp,
+ int *procdescp, int pdflags)
+{
+ struct proc *p1;
+ struct proc *newproc;
+ int ok;
+ struct thread *td2;
+ struct vmspace *vm2;
+ vm_ooffset_t mem_charged;
+ int error;
+ static int curfail;
+ static struct timeval lastfail;
+#ifdef PROCDESC
+ struct file *fp_procdesc = NULL;
+#endif
+
+ /* Check for the undefined or unimplemented flags. */
+ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
+ return (EINVAL);
+
+ /* Signal value requires RFTSIGZMB. */
+ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
+ return (EINVAL);
+
+ /* Can't copy and clear. */
+ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ return (EINVAL);
+
+ /* Check the validity of the signal number. */
+ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
+ return (EINVAL);
+
+#ifdef PROCDESC
+ if ((flags & RFPROCDESC) != 0) {
+ /* Can't get a process descriptor without also creating a process. */
+ if ((flags & RFPROC) == 0)
+ return (EINVAL);
+
+ /* Must provide a place to put a procdesc if creating one. */
+ if (procdescp == NULL)
+ return (EINVAL);
+ }
+#endif
+
+ p1 = td->td_proc;
+
+ /*
+ * Here we don't create a new process, but we divorce
+ * certain parts of a process from itself.
+ */
+ if ((flags & RFPROC) == 0) {
+ *procp = NULL;
+ return (fork_norfproc(td, flags));
+ }
+
+#ifdef PROCDESC
+ /*
+ * If required, create a process descriptor in the parent first; we
+ * will abandon it if something goes wrong. We don't finit() until
+ * later.
+ */
+ if (flags & RFPROCDESC) {
+ error = falloc(td, &fp_procdesc, procdescp, 0);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ mem_charged = 0;
+ vm2 = NULL;
+ if (pages == 0)
+ pages = KSTACK_PAGES;
+ /* Allocate new proc. */
+ newproc = uma_zalloc(proc_zone, M_WAITOK);
+ td2 = FIRST_THREAD_IN_PROC(newproc);
+ if (td2 == NULL) {
+ td2 = thread_alloc(pages);
+ if (td2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ proc_linkup(newproc, td2);
+ } else {
+ if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
+ if (td2->td_kstack != 0)
+ vm_thread_dispose(td2);
+ if (!thread_alloc_stack(td2, pages)) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ }
+ }
+
+ if ((flags & RFMEM) == 0) {
+ vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
+ if (vm2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ if (!swap_reserve(mem_charged)) {
+ /*
+ * The swap reservation failed. The accounting
+ * from the entries of the copied vm2 will be
+ * subtracted in vmspace_free(), so force the
+ * reservation there.
+ */
+ swap_reserve_force(mem_charged);
+ error = ENOMEM;
+ goto fail1;
+ }
+ } else
+ vm2 = NULL;
+
+ /*
+ * XXX: This is ugly; when we copy resource usage, we need to bump
+ * per-cred resource counters.
+ */
+ newproc->p_ucred = p1->p_ucred;
+
+ /*
+ * Initialize resource accounting for the child process.
+ */
+ error = racct_proc_fork(p1, newproc);
+ if (error != 0) {
+ error = EAGAIN;
+ goto fail1;
+ }
+
+#ifdef MAC
+ mac_proc_init(newproc);
+#endif
+ knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
+ STAILQ_INIT(&newproc->p_ktr);
+
+ /* We have to lock the process tree while we look for a pid. */
+ sx_slock(&proctree_lock);
+
+ /*
+ * Although process entries are dynamically created, we still keep
+ * a global limit on the maximum number we will create. Don't allow
+ * a nonprivileged user to use the last ten processes; don't let root
+ * exceed the limit. The variable nprocs is the current number of
+ * processes, maxproc is the limit.
+ */
+ sx_xlock(&allproc_lock);
+ if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
+ PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
+ error = EAGAIN;
+ goto fail;
+ }
+
+ /*
+ * Increment the count of procs running with this uid. Don't allow
+ * a nonprivileged user to exceed their current limit.
+ *
+ * XXXRW: Can we avoid privilege here if it's not needed?
+ */
+ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
+ if (error == 0)
+ ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
+ else {
+ PROC_LOCK(p1);
+ ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
+ lim_cur(p1, RLIMIT_NPROC));
+ PROC_UNLOCK(p1);
+ }
+ if (ok) {
+ do_fork(td, flags, newproc, td2, vm2, pdflags);
+
+ /*
+ * Return child proc pointer to parent.
+ */
+ *procp = newproc;
+#ifdef PROCDESC
+ if (flags & RFPROCDESC) {
+ procdesc_finit(newproc->p_procdesc, fp_procdesc);
+ fdrop(fp_procdesc, td);
+ }
+#endif
+ racct_proc_fork_done(newproc);
+ return (0);
+ }
+
+ error = EAGAIN;
+fail:
+ sx_sunlock(&proctree_lock);
+ if (ppsratecheck(&lastfail, &curfail, 1))
+ printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
+ td->td_ucred->cr_ruid, p1->p_pid);
+ sx_xunlock(&allproc_lock);
+#ifdef MAC
+ mac_proc_destroy(newproc);
+#endif
+ racct_proc_exit(newproc);
+fail1:
+ if (vm2 != NULL)
+ vmspace_free(vm2);
+ uma_zfree(proc_zone, newproc);
+#ifdef PROCDESC
+ if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
+ fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
+ fdrop(fp_procdesc, td);
+ }
+#endif
+ pause("fork", hz / 2);
+ return (error);
+}
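+
+/*
+ * Editor's illustrative sketch (not part of this change): the flag
+ * combinations below mirror the accounting cases in do_fork() above.
+ * The wrapper name and calling context are hypothetical; the real
+ * callers are the fork(2)/vfork(2)/rfork(2) syscall glue.
+ */
+#if 0
+static int
+example_fork1_calls(struct thread *td)
+{
+	struct proc *p2;
+	int error;
+
+	/* fork(2): copy the descriptor table and create a new process. */
+	error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
+	if (error != 0)
+		return (error);
+
+	/* vfork(2)-style: share the address space, parent waits via RFPPWAIT. */
+	return (fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2, NULL, 0));
+}
+#endif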
+
+/*
+ * Handle the return of a child process from fork1(). This function
+ * is called from the MD fork_trampoline() entry point.
+ */
+void
+fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
+ struct trapframe *frame)
+{
+ struct proc *p;
+ struct thread *td;
+ struct thread *dtd;
+
+ td = curthread;
+ p = td->td_proc;
+ KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
+
+ CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
+ td, td->td_sched, p->p_pid, td->td_name);
+
+ sched_fork_exit(td);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((dtd = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(dtd);
+ }
+ thread_unlock(td);
+
+ /*
+ * cpu_set_fork_handler intercepts this function call to
+ * have it call a non-returning function instead, keeping the thread in kernel mode.
+ * initproc has its own fork handler, but it does return.
+ */
+ KASSERT(callout != NULL, ("NULL callout in fork_exit"));
+ callout(arg, frame);
+
+ /*
+ * Check if a kernel thread misbehaved and returned from its main
+ * function.
+ */
+ if (p->p_flag & P_KTHREAD) {
+ printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
+ td->td_name, p->p_pid);
+ kproc_exit(0);
+ }
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ if (p->p_sysent->sv_schedtail != NULL)
+ (p->p_sysent->sv_schedtail)(td);
+}
+
+/*
+ * Simplified back end of syscall(), used when returning from fork()
+ * directly into user mode. Giant is not held on entry, and must not
+ * be held on return. This function is passed in to fork_exit() as the
+ * first parameter and is called when returning to a new userland process.
+ */
+void
+fork_return(struct thread *td, struct trapframe *frame)
+{
+ struct proc *p, *dbg;
+
+ if (td->td_dbgflags & TDB_STOPATFORK) {
+ p = td->td_proc;
+ sx_xlock(&proctree_lock);
+ PROC_LOCK(p);
+ if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
+ (P_TRACED | P_FOLLOWFORK)) {
+ /*
+ * If debugger still wants auto-attach for the
+ * parent's children, do it now.
+ */
+ dbg = p->p_pptr->p_pptr;
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ proc_reparent(p, dbg);
+ sx_xunlock(&proctree_lock);
+ td->td_dbgflags |= TDB_CHILD;
+ ptracestop(td, SIGSTOP);
+ td->td_dbgflags &= ~TDB_CHILD;
+ } else {
+ /*
+ * ... otherwise clear the request.
+ */
+ sx_xunlock(&proctree_lock);
+ td->td_dbgflags &= ~TDB_STOPATFORK;
+ cv_broadcast(&p->p_dbgwait);
+ }
+ PROC_UNLOCK(p);
+ }
+
+ userret(td, frame);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSRET))
+ ktrsysret(SYS_fork, 0, 0);
+#endif
+}
diff --git a/sys/kern/kern_gzio.c b/sys/kern/kern_gzio.c
new file mode 100644
index 0000000..15dc301
--- /dev/null
+++ b/sys/kern/kern_gzio.c
@@ -0,0 +1,400 @@
+/*
+ * $Id: kern_gzio.c,v 1.6 2008-10-18 22:54:45 lbazinet Exp $
+ *
+ * core_gzip.c -- gzip routines used in compressing user process cores
+ *
+ * This file is derived from src/lib/libz/gzio.c in FreeBSD.
+ */
+
+/* gzio.c -- IO on .gz files
+ * Copyright (C) 1995-1998 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+/* @(#) $FreeBSD$ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/endian.h>
+#include <net/zutil.h>
+#include <sys/libkern.h>
+
+#include <sys/mount.h>
+
+#define GZ_HEADER_LEN 10
+
+#ifndef Z_BUFSIZE
+# ifdef MAXSEG_64K
+# define Z_BUFSIZE 4096 /* minimize memory usage for 16-bit DOS */
+# else
+# define Z_BUFSIZE 16384
+# endif
+#endif
+#ifndef Z_PRINTF_BUFSIZE
+# define Z_PRINTF_BUFSIZE 4096
+#endif
+
+#define ALLOC(size) malloc(size, M_TEMP, M_WAITOK | M_ZERO)
+#define TRYFREE(p) {if (p) free(p, M_TEMP);}
+
+static int gz_magic[2] = {0x1f, 0x8b}; /* gzip magic header */
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define RESERVED 0xE0 /* bits 5..7: reserved */
+
+typedef struct gz_stream {
+ z_stream stream;
+ int z_err; /* error code for last stream operation */
+ int z_eof; /* set if end of input file */
+ struct vnode *file; /* vnode pointer of .gz file */
+ Byte *inbuf; /* input buffer */
+ Byte *outbuf; /* output buffer */
+ uLong crc; /* crc32 of uncompressed data */
+ char *msg; /* error message */
+ char *path; /* path name for debugging only */
+ int transparent; /* 1 if input file is not a .gz file */
+ char mode; /* 'w' or 'r' */
+ long startpos; /* start of compressed data in file (header skipped) */
+ off_t outoff; /* current offset in output file */
+ int flags;
+} gz_stream;
+
+
+local int do_flush OF((gzFile file, int flush));
+local int destroy OF((gz_stream *s));
+local void putU32 OF((gz_stream *file, uint32_t x));
+local void *gz_alloc OF((void *notused, u_int items, u_int size));
+local void gz_free OF((void *notused, void *ptr));
+
+/* ===========================================================================
+ Opens a gzip (.gz) stream for writing. The mode parameter is as in
+ fopen ("wb"); only write mode is supported here. The output file is
+ given by the vnode vp; the path argument is kept for debugging only.
+ gz_open returns NULL if the arguments are invalid or if there was
+ insufficient memory to allocate the compression state.
+*/
+gzFile gz_open (path, mode, vp)
+ const char *path;
+ const char *mode;
+ struct vnode *vp;
+{
+ int err;
+ int level = Z_DEFAULT_COMPRESSION; /* compression level */
+ int strategy = Z_DEFAULT_STRATEGY; /* compression strategy */
+ const char *p = mode;
+ gz_stream *s;
+ char fmode[80]; /* copy of mode, without the compression level */
+ char *m = fmode;
+ ssize_t resid;
+ int error;
+ char buf[GZ_HEADER_LEN + 1];
+
+ if (!path || !mode) return Z_NULL;
+
+ s = (gz_stream *)ALLOC(sizeof(gz_stream));
+ if (!s) return Z_NULL;
+
+ s->stream.zalloc = (alloc_func)gz_alloc;
+ s->stream.zfree = (free_func)gz_free;
+ s->stream.opaque = (voidpf)0;
+ s->stream.next_in = s->inbuf = Z_NULL;
+ s->stream.next_out = s->outbuf = Z_NULL;
+ s->stream.avail_in = s->stream.avail_out = 0;
+ s->file = NULL;
+ s->z_err = Z_OK;
+ s->z_eof = 0;
+ s->crc = 0;
+ s->msg = NULL;
+ s->transparent = 0;
+ s->outoff = 0;
+ s->flags = 0;
+
+ s->path = (char*)ALLOC(strlen(path)+1);
+ if (s->path == NULL) {
+ return destroy(s), (gzFile)Z_NULL;
+ }
+ strcpy(s->path, path); /* do this early for debugging */
+
+ s->mode = '\0';
+ do {
+ if (*p == 'r') s->mode = 'r';
+ if (*p == 'w' || *p == 'a') s->mode = 'w';
+ if (*p >= '0' && *p <= '9') {
+ level = *p - '0';
+ } else if (*p == 'f') {
+ strategy = Z_FILTERED;
+ } else if (*p == 'h') {
+ strategy = Z_HUFFMAN_ONLY;
+ } else {
+ *m++ = *p; /* copy the mode */
+ }
+ } while (*p++ && m != fmode + sizeof(fmode));
+
+ if (s->mode != 'w') {
+ log(LOG_ERR, "gz_open: mode is not w (%c)\n", s->mode);
+ return destroy(s), (gzFile)Z_NULL;
+ }
+
+ err = deflateInit2(&(s->stream), level,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, strategy);
+ /* windowBits is passed < 0 to suppress zlib header */
+
+ s->stream.next_out = s->outbuf = (Byte*)ALLOC(Z_BUFSIZE);
+ if (err != Z_OK || s->outbuf == Z_NULL) {
+ return destroy(s), (gzFile)Z_NULL;
+ }
+
+ s->stream.avail_out = Z_BUFSIZE;
+ s->file = vp;
+
+ /* Write a very simple .gz header:
+ */
+ snprintf(buf, sizeof(buf), "%c%c%c%c%c%c%c%c%c%c", gz_magic[0],
+ gz_magic[1], Z_DEFLATED, 0 /*flags*/, 0,0,0,0 /*time*/,
+ 0 /*xflags*/, OS_CODE);
+
+ if ((error = vn_rdwr(UIO_WRITE, s->file, buf, GZ_HEADER_LEN, s->outoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread))) {
+ s->outoff += GZ_HEADER_LEN - resid;
+ return destroy(s), (gzFile)Z_NULL;
+ }
+ s->outoff += GZ_HEADER_LEN;
+ s->startpos = 10L;
+
+ return (gzFile)s;
+}
+
+
+/* ===========================================================================
+ Clean up, then free the given gz_stream. Returns a zlib error code.
+ Try freeing in the reverse order of allocations.
+*/
+local int destroy (s)
+ gz_stream *s;
+{
+ int err = Z_OK;
+
+ if (!s) return Z_STREAM_ERROR;
+
+ TRYFREE(s->msg);
+
+ if (s->stream.state != NULL) {
+ if (s->mode == 'w') {
+ err = deflateEnd(&(s->stream));
+ }
+ }
+ if (s->z_err < 0) err = s->z_err;
+
+ TRYFREE(s->inbuf);
+ TRYFREE(s->outbuf);
+ TRYFREE(s->path);
+ TRYFREE(s);
+ return err;
+}
+
+
+/* ===========================================================================
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of bytes actually written (0 in case of error).
+*/
+int ZEXPORT gzwrite (file, buf, len)
+ gzFile file;
+ const voidp buf;
+ unsigned len;
+{
+ gz_stream *s = (gz_stream*)file;
+ off_t curoff;
+ size_t resid;
+ int error;
+
+ if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
+
+ s->stream.next_in = (Bytef*)buf;
+ s->stream.avail_in = len;
+
+ curoff = s->outoff;
+ while (s->stream.avail_in != 0) {
+
+ if (s->stream.avail_out == 0) {
+
+ s->stream.next_out = s->outbuf;
+ error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, Z_BUFSIZE,
+ curoff, UIO_SYSSPACE, IO_UNIT,
+ curproc->p_ucred, NOCRED, &resid, curthread);
+ if (error) {
+ log(LOG_ERR, "gzwrite: vn_rdwr return %d\n", error);
+ curoff += Z_BUFSIZE - resid;
+ s->z_err = Z_ERRNO;
+ break;
+ }
+ curoff += Z_BUFSIZE;
+ s->stream.avail_out = Z_BUFSIZE;
+ }
+ s->z_err = deflate(&(s->stream), Z_NO_FLUSH);
+ if (s->z_err != Z_OK) {
+ log(LOG_ERR,
+ "gzwrite: deflate returned error %d\n", s->z_err);
+ break;
+ }
+ }
+
+ s->crc = ~crc32_raw(buf, len, ~s->crc);
+ s->outoff = curoff;
+
+ return (int)(len - s->stream.avail_in);
+}
+
+
+/* ===========================================================================
+ Flushes all pending output into the compressed file. The parameter
+ flush is as in the deflate() function.
+*/
+local int do_flush (file, flush)
+ gzFile file;
+ int flush;
+{
+ uInt len;
+ int done = 0;
+ gz_stream *s = (gz_stream*)file;
+ off_t curoff = s->outoff;
+ size_t resid;
+ int error;
+
+ if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
+
+ if (s->stream.avail_in) {
+ log(LOG_WARNING, "do_flush: avail_in non-zero on entry\n");
+ }
+
+ s->stream.avail_in = 0; /* should be zero already anyway */
+
+ for (;;) {
+ len = Z_BUFSIZE - s->stream.avail_out;
+
+ if (len != 0) {
+ error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, len, curoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread);
+ if (error) {
+ s->z_err = Z_ERRNO;
+ s->outoff = curoff + len - resid;
+ return Z_ERRNO;
+ }
+ s->stream.next_out = s->outbuf;
+ s->stream.avail_out = Z_BUFSIZE;
+ curoff += len;
+ }
+ if (done) break;
+ s->z_err = deflate(&(s->stream), flush);
+
+ /* Ignore the second of two consecutive flushes: */
+ if (len == 0 && s->z_err == Z_BUF_ERROR) s->z_err = Z_OK;
+
+ /* deflate has finished flushing only when it hasn't used up
+ * all the available space in the output buffer:
+ */
+ done = (s->stream.avail_out != 0 || s->z_err == Z_STREAM_END);
+
+ if (s->z_err != Z_OK && s->z_err != Z_STREAM_END) break;
+ }
+ s->outoff = curoff;
+
+ return s->z_err == Z_STREAM_END ? Z_OK : s->z_err;
+}
+
+int ZEXPORT gzflush (file, flush)
+ gzFile file;
+ int flush;
+{
+ gz_stream *s = (gz_stream*)file;
+ int err = do_flush (file, flush);
+
+ if (err) return err;
+ return s->z_err == Z_STREAM_END ? Z_OK : s->z_err;
+}
+
+
+/* ===========================================================================
+ Outputs a 32-bit value in LSB (little-endian) order to the given file
+*/
+local void putU32 (s, x)
+ gz_stream *s;
+ uint32_t x;
+{
+ uint32_t xx;
+ off_t curoff = s->outoff;
+ ssize_t resid;
+
+#if BYTE_ORDER == BIG_ENDIAN
+ xx = bswap32(x);
+#else
+ xx = x;
+#endif
+ vn_rdwr(UIO_WRITE, s->file, (caddr_t)&xx, sizeof(xx), curoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread);
+ s->outoff += sizeof(xx) - resid;
+}
+
+
+/* ===========================================================================
+ Flushes all pending output if necessary, closes the compressed file
+ and deallocates all the (de)compression state.
+*/
+int ZEXPORT gzclose (file)
+ gzFile file;
+{
+ int err;
+ gz_stream *s = (gz_stream*)file;
+
+ if (s == NULL) return Z_STREAM_ERROR;
+
+ if (s->mode == 'w') {
+ err = do_flush (file, Z_FINISH);
+ if (err != Z_OK) {
+ log(LOG_ERR, "gzclose: do_flush failed (err %d)\n", err);
+ return destroy((gz_stream*)file);
+ }
+#if 0
+ printf("gzclose: putting crc: %lld total: %lld\n",
+ (long long)s->crc, (long long)s->stream.total_in);
+ printf("sizeof uLong = %d\n", (int)sizeof(uLong));
+#endif
+ putU32 (s, s->crc);
+ putU32 (s, (uint32_t) s->stream.total_in);
+ }
+ return destroy((gz_stream*)file);
+}
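+
+/*
+ * Editor's illustrative sketch (not part of this change): the intended
+ * calling sequence for the routines above when compressing a core image.
+ * The vnode and data buffer are assumed to be supplied by the coredump
+ * path; error mapping is simplified.
+ */
+#if 0
+static int
+example_gz_core(struct vnode *vp, void *buf, unsigned len)
+{
+	gzFile gz;
+
+	gz = gz_open("core.gz", "wb", vp);	/* write-only gzip stream */
+	if (gz == Z_NULL)
+		return (ENOMEM);
+	if (gzwrite(gz, buf, len) != (int)len)	/* short count on error */
+		log(LOG_ERR, "example_gz_core: short gzwrite\n");
+	if (gzclose(gz) != Z_OK)		/* flush, emit CRC and length */
+		return (EIO);
+	return (0);
+}
+#endif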
+
+/*
+ * Space allocation and freeing routines for use by zlib routines when called
+ * from gzip modules.
+ */
+static void *
+gz_alloc(void *notused __unused, u_int items, u_int size)
+{
+ void *ptr;
+
+ ptr = malloc(items * size, M_TEMP, M_NOWAIT | M_ZERO);
+ return ptr;
+}
+
+static void
+gz_free(void *opaque __unused, void *ptr)
+{
+ free(ptr, M_TEMP);
+}
+
diff --git a/sys/kern/kern_hhook.c b/sys/kern/kern_hhook.c
new file mode 100644
index 0000000..321e1a9
--- /dev/null
+++ b/sys/kern/kern_hhook.c
@@ -0,0 +1,521 @@
+/*-
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology,
+ * made possible in part by grants from the FreeBSD Foundation and Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/khelp.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/module_khelp.h>
+#include <sys/osd.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/systm.h>
+
+#include <net/vnet.h>
+
+struct hhook {
+ hhook_func_t hhk_func;
+ struct helper *hhk_helper;
+ void *hhk_udata;
+ STAILQ_ENTRY(hhook) hhk_next;
+};
+
+static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists");
+
+LIST_HEAD(hhookheadhead, hhook_head);
+struct hhookheadhead hhook_head_list;
+VNET_DEFINE(struct hhookheadhead, hhook_vhead_list);
+#define V_hhook_vhead_list VNET(hhook_vhead_list)
+
+static struct mtx hhook_head_list_lock;
+MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
+ MTX_DEF);
+
+/* Protected by hhook_head_list_lock. */
+static uint32_t n_hhookheads;
+
+/* Private function prototypes. */
+static void hhook_head_destroy(struct hhook_head *hhh);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
+
+#define HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock)
+#define HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
+#define HHHLIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED)
+
+#define HHH_LOCK_INIT(hhh) rm_init(&(hhh)->hhh_lock, "hhook_head rm lock")
+#define HHH_LOCK_DESTROY(hhh) rm_destroy(&(hhh)->hhh_lock)
+#define HHH_WLOCK(hhh) rm_wlock(&(hhh)->hhh_lock)
+#define HHH_WUNLOCK(hhh) rm_wunlock(&(hhh)->hhh_lock)
+#define HHH_RLOCK(hhh, rmpt) rm_rlock(&(hhh)->hhh_lock, (rmpt))
+#define HHH_RUNLOCK(hhh, rmpt) rm_runlock(&(hhh)->hhh_lock, (rmpt))
+
+/*
+ * Run all helper hook functions for a given hook point.
+ */
+void
+hhook_run_hooks(struct hhook_head *hhh, void *ctx_data, struct osd *hosd)
+{
+ struct hhook *hhk;
+ void *hdata;
+ struct rm_priotracker rmpt;
+
+ KASSERT(hhh->hhh_refcount > 0, ("hhook_head %p refcount is 0", hhh));
+
+ HHH_RLOCK(hhh, &rmpt);
+ STAILQ_FOREACH(hhk, &hhh->hhh_hooks, hhk_next) {
+ if (hhk->hhk_helper->h_flags & HELPER_NEEDS_OSD) {
+ hdata = osd_get(OSD_KHELP, hosd, hhk->hhk_helper->h_id);
+ if (hdata == NULL)
+ continue;
+ } else
+ hdata = NULL;
+
+ /*
+ * XXXLAS: We currently ignore the int returned by the hook,
+ * but will likely want to handle it in future to allow hhook to
+ * be used like pfil and effect changes at the hhook calling
+ * site e.g. we could define a new hook type of HHOOK_TYPE_PFIL
+ * and standardise what particular return values mean and set
+ * the context data to pass exactly the same information as pfil
+ * hooks currently receive, thus replicating pfil with hhook.
+ */
+ hhk->hhk_func(hhh->hhh_type, hhh->hhh_id, hhk->hhk_udata,
+ ctx_data, hdata, hosd);
+ }
+ HHH_RUNLOCK(hhh, &rmpt);
+}
+
+/*
+ * Register a new helper hook function with a helper hook point.
+ */
+int
+hhook_add_hook(struct hhook_head *hhh, struct hookinfo *hki, uint32_t flags)
+{
+ struct hhook *hhk, *tmp;
+ int error;
+
+ error = 0;
+
+ if (hhh == NULL)
+ return (ENOENT);
+
+ hhk = malloc(sizeof(struct hhook), M_HHOOK,
+ M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+ if (hhk == NULL)
+ return (ENOMEM);
+
+ hhk->hhk_helper = hki->hook_helper;
+ hhk->hhk_func = hki->hook_func;
+ hhk->hhk_udata = hki->hook_udata;
+
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) {
+ if (tmp->hhk_func == hki->hook_func &&
+ tmp->hhk_udata == hki->hook_udata) {
+ /* The helper hook function is already registered. */
+ error = EEXIST;
+ break;
+ }
+ }
+
+ if (!error) {
+ STAILQ_INSERT_TAIL(&hhh->hhh_hooks, hhk, hhk_next);
+ hhh->hhh_nhooks++;
+ } else
+ free(hhk, M_HHOOK);
+
+ HHH_WUNLOCK(hhh);
+
+ return (error);
+}
+
+/*
+ * Register a helper hook function with a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ *
+ * The logic is unfortunately far more complex than for
+ * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with
+ * M_WAITOK and thus we cannot call hhook_add_hook() with the
+ * hhook_head_list_lock held.
+ *
+ * The logic assembles an array of hhook_head structs that correspond to the
+ * helper hook point being hooked and bumps the refcount on each (all done with
+ * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and
+ * hhook_add_hook() is called and the refcount dropped for each hhook_head
+ * struct in the array.
+ */
+int
+hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags)
+{
+ struct hhook_head **heads_to_hook, *hhh;
+ int error, i, n_heads_to_hook;
+
+tryagain:
+ error = i = 0;
+ /*
+ * Accessing n_hhookheads without hhook_head_list_lock held opens up a
+ * race with hhook_head_register() which we are unlikely to lose, but
+ * nonetheless have to cope with - hence the complex goto logic.
+ */
+ n_heads_to_hook = n_hhookheads;
+ heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *),
+ M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT);
+ if (heads_to_hook == NULL)
+ return (ENOMEM);
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id) {
+ if (i < n_heads_to_hook) {
+ heads_to_hook[i] = hhh;
+ refcount_acquire(&heads_to_hook[i]->hhh_refcount);
+ i++;
+ } else {
+ /*
+ * We raced with hhook_head_register() which
+ * inserted a hhook_head that we need to hook
+ * but did not malloc space for. Abort this run
+ * and try again.
+ */
+ for (i--; i >= 0; i--)
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ free(heads_to_hook, M_HHOOK);
+ HHHLIST_UNLOCK();
+ goto tryagain;
+ }
+ }
+ }
+ HHHLIST_UNLOCK();
+
+ for (i--; i >= 0; i--) {
+ if (!error)
+ error = hhook_add_hook(heads_to_hook[i], hki, flags);
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ }
+
+ free(heads_to_hook, M_HHOOK);
+
+ return (error);
+}
+
+/*
+ * Remove a helper hook function from a helper hook point.
+ */
+int
+hhook_remove_hook(struct hhook_head *hhh, struct hookinfo *hki)
+{
+ struct hhook *tmp;
+
+ if (hhh == NULL)
+ return (ENOENT);
+
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) {
+ if (tmp->hhk_func == hki->hook_func &&
+ tmp->hhk_udata == hki->hook_udata) {
+ STAILQ_REMOVE(&hhh->hhh_hooks, tmp, hhook, hhk_next);
+ free(tmp, M_HHOOK);
+ hhh->hhh_nhooks--;
+ break;
+ }
+ }
+ HHH_WUNLOCK(hhh);
+
+ return (0);
+}
+
+/*
+ * Remove a helper hook function from a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ */
+int
+hhook_remove_hook_lookup(struct hookinfo *hki)
+{
+ struct hhook_head *hhh;
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id)
+ hhook_remove_hook(hhh, hki);
+ }
+ HHHLIST_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Register a new helper hook point.
+ */
+int
+hhook_head_register(int32_t hhook_type, int32_t hhook_id, struct hhook_head **hhh,
+ uint32_t flags)
+{
+ struct hhook_head *tmphhh;
+
+ tmphhh = hhook_head_get(hhook_type, hhook_id);
+
+ if (tmphhh != NULL) {
+ /* Hook point previously registered. */
+ hhook_head_release(tmphhh);
+ return (EEXIST);
+ }
+
+ tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK,
+ M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+ if (tmphhh == NULL)
+ return (ENOMEM);
+
+ tmphhh->hhh_type = hhook_type;
+ tmphhh->hhh_id = hhook_id;
+ tmphhh->hhh_nhooks = 0;
+ STAILQ_INIT(&tmphhh->hhh_hooks);
+ HHH_LOCK_INIT(tmphhh);
+ refcount_init(&tmphhh->hhh_refcount, 1);
+
+ HHHLIST_LOCK();
+ if (flags & HHOOK_HEADISINVNET) {
+ tmphhh->hhh_flags |= HHH_ISINVNET;
+#ifdef VIMAGE
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ tmphhh->hhh_vid = (uintptr_t)curvnet;
+ LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext);
+#endif
+ }
+ LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next);
+ n_hhookheads++;
+ HHHLIST_UNLOCK();
+
+ khelp_new_hhook_registered(tmphhh, flags);
+
+ if (hhh != NULL)
+ *hhh = tmphhh;
+ else
+ refcount_release(&tmphhh->hhh_refcount);
+
+ return (0);
+}
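+
+/*
+ * Editor's illustrative sketch (not part of this change): how a subsystem
+ * might export a hook point with hhook_head_register() and later invoke any
+ * attached helper hooks with hhook_run_hooks(). The type/id values are
+ * hypothetical placeholders; helper hook functions are normally attached
+ * through the khelp(9) framework, which fills in struct hookinfo.
+ */
+#if 0
+#define	EXAMPLE_HHOOK_TYPE	1	/* hypothetical hook point type */
+#define	EXAMPLE_HHOOK_ID	1	/* hypothetical hook point id */
+
+static struct hhook_head *example_hhh;
+
+static void
+example_subsys_init(void)
+{
+
+	/* Export the hook point once, e.g. from a SYSINIT. */
+	(void)hhook_head_register(EXAMPLE_HHOOK_TYPE, EXAMPLE_HHOOK_ID,
+	    &example_hhh, HHOOK_WAITOK);
+}
+
+static void
+example_subsys_codepath(void *ctx_data, struct osd *hosd)
+{
+
+	/* Run any helper hook functions attached to the hook point. */
+	if (example_hhh != NULL)
+		hhook_run_hooks(example_hhh, ctx_data, hosd);
+}
+#endif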
+
+static void
+hhook_head_destroy(struct hhook_head *hhh)
+{
+ struct hhook *tmp, *tmp2;
+
+ HHHLIST_LOCK_ASSERT();
+ KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0"));
+
+ LIST_REMOVE(hhh, hhh_next);
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET)
+ LIST_REMOVE(hhh, hhh_vnext);
+#endif
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2)
+ free(tmp, M_HHOOK);
+ HHH_WUNLOCK(hhh);
+ HHH_LOCK_DESTROY(hhh);
+ free(hhh, M_HHOOK);
+ n_hhookheads--;
+}
+
+/*
+ * Remove a helper hook point.
+ */
+int
+hhook_head_deregister(struct hhook_head *hhh)
+{
+ int error;
+
+ error = 0;
+
+ HHHLIST_LOCK();
+ if (hhh == NULL)
+ error = ENOENT;
+ else if (hhh->hhh_refcount > 1)
+ error = EBUSY;
+ else
+ hhook_head_destroy(hhh);
+ HHHLIST_UNLOCK();
+
+ return (error);
+}
+
+/*
+ * Remove a helper hook point via a hhook_head lookup.
+ */
+int
+hhook_head_deregister_lookup(int32_t hhook_type, int32_t hhook_id)
+{
+ struct hhook_head *hhh;
+ int error;
+
+ hhh = hhook_head_get(hhook_type, hhook_id);
+ error = hhook_head_deregister(hhh);
+
+ if (error == EBUSY)
+ hhook_head_release(hhh);
+
+ return (error);
+}
+
+/*
+ * Lookup and return the hhook_head struct associated with the specified type
+ * and id, or NULL if not found. If found, the hhook_head's refcount is bumped.
+ */
+struct hhook_head *
+hhook_head_get(int32_t hhook_type, int32_t hhook_id)
+{
+ struct hhook_head *hhh;
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) {
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) ==
+ HHOOK_HEADISINVNET) {
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ if (hhh->hhh_vid != (uintptr_t)curvnet)
+ continue;
+ }
+#endif
+ refcount_acquire(&hhh->hhh_refcount);
+ break;
+ }
+ }
+ HHHLIST_UNLOCK();
+
+ return (hhh);
+}
+
+void
+hhook_head_release(struct hhook_head *hhh)
+{
+
+ refcount_release(&hhh->hhh_refcount);
+}
+
+/*
+ * Check the hhook_head private flags and return the appropriate public
+ * representation of the flag to the caller. The function is implemented in a
+ * way that allows us to cope with other subsystems becoming virtualised in the
+ * future.
+ */
+uint32_t
+hhook_head_is_virtualised(struct hhook_head *hhh)
+{
+ uint32_t ret;
+
+ ret = 0;
+
+ if (hhh != NULL) {
+ if (hhh->hhh_flags & HHH_ISINVNET)
+ ret = HHOOK_HEADISINVNET;
+ }
+
+ return (ret);
+}
+
+uint32_t
+hhook_head_is_virtualised_lookup(int32_t hook_type, int32_t hook_id)
+{
+ struct hhook_head *hhh;
+ uint32_t ret;
+
+ hhh = hhook_head_get(hook_type, hook_id);
+
+ if (hhh == NULL)
+ return (0);
+
+ ret = hhook_head_is_virtualised(hhh);
+ hhook_head_release(hhh);
+
+ return (ret);
+}
+
+/*
+ * Vnet created and being initialised.
+ */
+static void
+hhook_vnet_init(const void *unused __unused)
+{
+
+ LIST_INIT(&V_hhook_vhead_list);
+}
+
+/*
+ * Vnet being torn down and destroyed.
+ */
+static void
+hhook_vnet_uninit(const void *unused __unused)
+{
+ struct hhook_head *hhh, *tmphhh;
+
+ /*
+ * If subsystems which export helper hook points use the hhook KPI
+ * correctly, the loop below should have no work to do because the
+ * subsystem should have already called hhook_head_deregister().
+ */
+ HHHLIST_LOCK();
+ LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) {
+ printf("%s: hhook_head type=%d, id=%d cleanup required\n",
+ __func__, hhh->hhh_type, hhh->hhh_id);
+ hhook_head_destroy(hhh);
+ }
+ HHHLIST_UNLOCK();
+}
+
+
+/*
+ * When a vnet is created and being initialised, init the V_hhook_vhead_list.
+ */
+VNET_SYSINIT(hhook_vnet_init, SI_SUB_MBUF, SI_ORDER_FIRST,
+ hhook_vnet_init, NULL);
+
+/*
+ * The hhook KPI provides a mechanism for subsystems which export helper hook
+ * points to clean up on vnet tear down, but in case the KPI is misused,
+ * provide a function to clean up and free memory for a vnet being destroyed.
+ */
+VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_MBUF, SI_ORDER_ANY,
+ hhook_vnet_uninit, NULL);
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
new file mode 100644
index 0000000..f412d17
--- /dev/null
+++ b/sys/kern/kern_idle.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (C) 2000-2004 The FreeBSD Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/unistd.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
+
+static void idle_setup(void *dummy);
+SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL);
+
+/*
+ * Set up per-cpu idle process contexts. The APs shouldn't be running or
+ * accessing their idle processes at this point, so don't bother with
+ * locking.
+ */
+static void
+idle_setup(void *dummy)
+{
+#ifdef SMP
+ struct pcpu *pc;
+#endif
+ struct proc *p;
+ struct thread *td;
+ int error;
+
+ p = NULL; /* start with no idle process */
+#ifdef SMP
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+#endif
+#ifdef SMP
+ error = kproc_kthread_add(sched_idletd, NULL, &p, &td,
+ RFSTOPPED | RFHIGHPID, 0, "idle", "idle: cpu%d", pc->pc_cpuid);
+ pc->pc_idlethread = td;
+#else
+ error = kproc_kthread_add(sched_idletd, NULL, &p, &td,
+ RFSTOPPED | RFHIGHPID, 0, "idle", "idle");
+ PCPU_SET(idlethread, td);
+#endif
+ if (error)
+ panic("idle_setup: kproc_create error %d\n", error);
+
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ td->td_flags |= TDF_IDLETD | TDF_NOLOAD;
+ sched_class(td, PRI_IDLE);
+ sched_prio(td, PRI_MAX_IDLE);
+ thread_unlock(td);
+#ifdef SMP
+ }
+#endif
+}
diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c
new file mode 100644
index 0000000..f4b04c3
--- /dev/null
+++ b/sys/kern/kern_intr.c
@@ -0,0 +1,1943 @@
+/*-
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/cpuset.h>
+#include <sys/rtprio.h>
+#include <sys/systm.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/random.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/unistd.h>
+#include <sys/vmmeter.h>
+#include <machine/atomic.h>
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/stdarg.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+#endif
+
+/*
+ * Describe an interrupt thread. There is one of these per interrupt event.
+ */
+struct intr_thread {
+ struct intr_event *it_event;
+ struct thread *it_thread; /* Kernel thread. */
+ int it_flags; /* (j) IT_* flags. */
+ int it_need; /* Needs service. */
+};
+
+/* Interrupt thread flags kept in it_flags */
+#define IT_DEAD 0x000001 /* Thread is waiting to exit. */
+#define IT_WAIT 0x000002 /* Thread is waiting for completion. */
+
+struct intr_entropy {
+ struct thread *td;
+ uintptr_t event;
+};
+
+struct intr_event *clk_intr_event;
+struct intr_event *tty_intr_event;
+void *vm_ih;
+struct proc *intrproc;
+
+static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
+
+static int intr_storm_threshold = 1000;
+TUNABLE_INT("hw.intr_storm_threshold", &intr_storm_threshold);
+SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RW,
+ &intr_storm_threshold, 0,
+ "Number of consecutive interrupts before storm protection is enabled");
+static TAILQ_HEAD(, intr_event) event_list =
+ TAILQ_HEAD_INITIALIZER(event_list);
+static struct mtx event_lock;
+MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF);
+
+static void intr_event_update(struct intr_event *ie);
+#ifdef INTR_FILTER
+static int intr_event_schedule_thread(struct intr_event *ie,
+ struct intr_thread *ithd);
+static int intr_filter_loop(struct intr_event *ie,
+ struct trapframe *frame, struct intr_thread **ithd);
+static struct intr_thread *ithread_create(const char *name,
+ struct intr_handler *ih);
+#else
+static int intr_event_schedule_thread(struct intr_event *ie);
+static struct intr_thread *ithread_create(const char *name);
+#endif
+static void ithread_destroy(struct intr_thread *ithread);
+static void ithread_execute_handlers(struct proc *p,
+ struct intr_event *ie);
+#ifdef INTR_FILTER
+static void priv_ithread_execute_handler(struct proc *p,
+ struct intr_handler *ih);
+#endif
+static void ithread_loop(void *);
+static void ithread_update(struct intr_thread *ithd);
+static void start_softintr(void *);
+
+/* Map an interrupt type to an ithread priority. */
+u_char
+intr_priority(enum intr_type flags)
+{
+ u_char pri;
+
+ flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
+ INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
+ switch (flags) {
+ case INTR_TYPE_TTY:
+ pri = PI_TTY;
+ break;
+ case INTR_TYPE_BIO:
+ pri = PI_DISK;
+ break;
+ case INTR_TYPE_NET:
+ pri = PI_NET;
+ break;
+ case INTR_TYPE_CAM:
+ pri = PI_DISK;
+ break;
+ case INTR_TYPE_AV:
+ pri = PI_AV;
+ break;
+ case INTR_TYPE_CLK:
+ pri = PI_REALTIME;
+ break;
+ case INTR_TYPE_MISC:
+ pri = PI_DULL; /* don't care */
+ break;
+ default:
+ /* We didn't specify an interrupt level. */
+ panic("intr_priority: no interrupt type in flags");
+ }
+
+ return pri;
+}
+
+/*
+ * Update an ithread based on the associated intr_event.
+ */
+static void
+ithread_update(struct intr_thread *ithd)
+{
+ struct intr_event *ie;
+ struct thread *td;
+ u_char pri;
+
+ ie = ithd->it_event;
+ td = ithd->it_thread;
+
+ /* Determine the overall priority of this event. */
+ if (TAILQ_EMPTY(&ie->ie_handlers))
+ pri = PRI_MAX_ITHD;
+ else
+ pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri;
+
+ /* Update name and priority. */
+ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name));
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+ thread_lock(td);
+ sched_prio(td, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Regenerate the full name of an interrupt event and update its priority.
+ */
+static void
+intr_event_update(struct intr_event *ie)
+{
+ struct intr_handler *ih;
+ char *last;
+ int missed, space;
+
+ /* Start off with no entropy and just the name of the event. */
+ mtx_assert(&ie->ie_lock, MA_OWNED);
+ strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+ ie->ie_flags &= ~IE_ENTROPY;
+ missed = 0;
+ space = 1;
+
+ /* Run through all the handlers updating values. */
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 <
+ sizeof(ie->ie_fullname)) {
+ strcat(ie->ie_fullname, " ");
+ strcat(ie->ie_fullname, ih->ih_name);
+ space = 0;
+ } else
+ missed++;
+ if (ih->ih_flags & IH_ENTROPY)
+ ie->ie_flags |= IE_ENTROPY;
+ }
+
+ /*
+ * If the handler names were too long, add +'s to indicate missing
+ * names. If we run out of room and still have +'s to add, change
+ * the last character from a + to a *.
+ */
+ last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2];
+ while (missed-- > 0) {
+ if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) {
+ if (*last == '+') {
+ *last = '*';
+ break;
+ } else
+ *last = '+';
+ } else if (space) {
+ strcat(ie->ie_fullname, " +");
+ space = 0;
+ } else
+ strcat(ie->ie_fullname, "+");
+ }
+
+ /*
+ * If this event has an ithread, update its priority and
+ * name.
+ */
+ if (ie->ie_thread != NULL)
+ ithread_update(ie->ie_thread);
+ CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
+}
+
+int
+intr_event_create(struct intr_event **event, void *source, int flags, int irq,
+ void (*pre_ithread)(void *), void (*post_ithread)(void *),
+ void (*post_filter)(void *), int (*assign_cpu)(void *, u_char),
+ const char *fmt, ...)
+{
+ struct intr_event *ie;
+ va_list ap;
+
+ /* The only valid flag during creation is IE_SOFT. */
+ if ((flags & ~IE_SOFT) != 0)
+ return (EINVAL);
+ ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
+ ie->ie_source = source;
+ ie->ie_pre_ithread = pre_ithread;
+ ie->ie_post_ithread = post_ithread;
+ ie->ie_post_filter = post_filter;
+ ie->ie_assign_cpu = assign_cpu;
+ ie->ie_flags = flags;
+ ie->ie_irq = irq;
+ ie->ie_cpu = NOCPU;
+ TAILQ_INIT(&ie->ie_handlers);
+ mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
+
+ va_start(ap, fmt);
+ vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
+ va_end(ap);
+ strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+ mtx_lock(&event_lock);
+ TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
+ mtx_unlock(&event_lock);
+ if (event != NULL)
+ *event = ie;
+ CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
+ return (0);
+}
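+
+/*
+ * Editor's illustrative sketch (not part of this change): the typical
+ * pairing of intr_event_create() with intr_event_add_handler(). The
+ * device softc, handler body and IRQ number are hypothetical
+ * placeholders; MD interrupt code normally supplies the pre/post
+ * ithread, post_filter and assign_cpu callbacks passed as NULL here.
+ */
+#if 0
+static void
+example_intr(void *arg)
+{
+
+	/* Threaded interrupt work for the hypothetical device goes here. */
+}
+
+static int
+example_setup_irq(void *sc, int irq)
+{
+	struct intr_event *ie;
+	void *cookie;
+	int error;
+
+	error = intr_event_create(&ie, sc, 0, irq, NULL, NULL, NULL, NULL,
+	    "irq%d:", irq);
+	if (error != 0)
+		return (error);
+	/* No filter, so example_intr runs in the event's ithread. */
+	return (intr_event_add_handler(ie, "example", NULL, example_intr,
+	    sc, intr_priority(INTR_TYPE_MISC), INTR_MPSAFE, &cookie));
+}
+#endif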
+
+/*
+ * Bind an interrupt event to the specified CPU. Note that not all
+ * platforms support binding an interrupt to a CPU. For those
+ * platforms this request will fail. For supported platforms, any
+ * associated ithreads as well as the primary interrupt context will
+ * be bound to the specified CPU. Using a cpu id of NOCPU unbinds
+ * the interrupt event.
+ */
+int
+intr_event_bind(struct intr_event *ie, u_char cpu)
+{
+ cpuset_t mask;
+ lwpid_t id;
+ int error;
+
+ /* Need a CPU to bind to. */
+ if (cpu != NOCPU && CPU_ABSENT(cpu))
+ return (EINVAL);
+
+ if (ie->ie_assign_cpu == NULL)
+ return (EOPNOTSUPP);
+
+ error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
+ if (error)
+ return (error);
+
+ /*
+ * If we have any ithreads try to set their mask first to verify
+ * permissions, etc.
+ */
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_thread != NULL) {
+ CPU_ZERO(&mask);
+ if (cpu == NOCPU)
+ CPU_COPY(cpuset_root, &mask);
+ else
+ CPU_SET(cpu, &mask);
+ id = ie->ie_thread->it_thread->td_tid;
+ mtx_unlock(&ie->ie_lock);
+ error = cpuset_setthread(id, &mask);
+ if (error)
+ return (error);
+ } else
+ mtx_unlock(&ie->ie_lock);
+ error = ie->ie_assign_cpu(ie->ie_source, cpu);
+ if (error) {
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_thread != NULL) {
+ CPU_ZERO(&mask);
+ if (ie->ie_cpu == NOCPU)
+ CPU_COPY(cpuset_root, &mask);
+ else
+ CPU_SET(ie->ie_cpu, &mask);
+ id = ie->ie_thread->it_thread->td_tid;
+ mtx_unlock(&ie->ie_lock);
+ (void)cpuset_setthread(id, &mask);
+ } else
+ mtx_unlock(&ie->ie_lock);
+ return (error);
+ }
+
+ mtx_lock(&ie->ie_lock);
+ ie->ie_cpu = cpu;
+ mtx_unlock(&ie->ie_lock);
+
+ return (error);
+}
+
+static struct intr_event *
+intr_lookup(int irq)
+{
+ struct intr_event *ie;
+
+ mtx_lock(&event_lock);
+ TAILQ_FOREACH(ie, &event_list, ie_list)
+ if (ie->ie_irq == irq &&
+ (ie->ie_flags & IE_SOFT) == 0 &&
+ TAILQ_FIRST(&ie->ie_handlers) != NULL)
+ break;
+ mtx_unlock(&event_lock);
+ return (ie);
+}
+
+int
+intr_setaffinity(int irq, void *m)
+{
+ struct intr_event *ie;
+ cpuset_t *mask;
+ u_char cpu;
+ int n;
+
+ mask = m;
+ cpu = NOCPU;
+ /*
+ * If we're setting all cpus we can unbind. Otherwise make sure
+ * only one cpu is in the set.
+ */
+ if (CPU_CMP(cpuset_root, mask)) {
+ for (n = 0; n < CPU_SETSIZE; n++) {
+ if (!CPU_ISSET(n, mask))
+ continue;
+ if (cpu != NOCPU)
+ return (EINVAL);
+ cpu = (u_char)n;
+ }
+ }
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return (ESRCH);
+ return (intr_event_bind(ie, cpu));
+}
+
+int
+intr_getaffinity(int irq, void *m)
+{
+ struct intr_event *ie;
+ cpuset_t *mask;
+
+ mask = m;
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return (ESRCH);
+ CPU_ZERO(mask);
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_cpu == NOCPU)
+ CPU_COPY(cpuset_root, mask);
+ else
+ CPU_SET(ie->ie_cpu, mask);
+ mtx_unlock(&ie->ie_lock);
+ return (0);
+}
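+
+/*
+ * Editor's illustrative sketch (not part of this change): pinning an
+ * IRQ to a single CPU through intr_setaffinity(). The IRQ and CPU
+ * numbers are placeholders supplied by the caller.
+ */
+#if 0
+static int
+example_pin_irq(int irq, int cpu)
+{
+	cpuset_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);	/* one CPU in the set binds; cpuset_root unbinds */
+	return (intr_setaffinity(irq, &mask));
+}
+#endif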
+
+int
+intr_event_destroy(struct intr_event *ie)
+{
+
+ mtx_lock(&event_lock);
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ mtx_unlock(&ie->ie_lock);
+ mtx_unlock(&event_lock);
+ return (EBUSY);
+ }
+ TAILQ_REMOVE(&event_list, ie, ie_list);
+#ifndef notyet
+ if (ie->ie_thread != NULL) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ mtx_unlock(&event_lock);
+ mtx_destroy(&ie->ie_lock);
+ free(ie, M_ITHREAD);
+ return (0);
+}
+
+#ifndef INTR_FILTER
+static struct intr_thread *
+ithread_create(const char *name)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ int error;
+
+ ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+ error = kproc_kthread_add(ithread_loop, ithd, &intrproc,
+ &td, RFSTOPPED | RFHIGHPID,
+ 0, "intr", "%s", name);
+ if (error)
+ panic("kproc_create() failed with %d", error);
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
+ TD_SET_IWAIT(td);
+ thread_unlock(td);
+ td->td_pflags |= TDP_ITHREAD;
+ ithd->it_thread = td;
+ CTR2(KTR_INTR, "%s: created %s", __func__, name);
+ return (ithd);
+}
+#else
+static struct intr_thread *
+ithread_create(const char *name, struct intr_handler *ih)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ int error;
+
+ ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+ error = kproc_kthread_add(ithread_loop, ih, &intrproc,
+ &td, RFSTOPPED | RFHIGHPID,
+ 0, "intr", "%s", name);
+ if (error)
+ panic("kproc_create() failed with %d", error);
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
+ TD_SET_IWAIT(td);
+ thread_unlock(td);
+ td->td_pflags |= TDP_ITHREAD;
+ ithd->it_thread = td;
+ CTR2(KTR_INTR, "%s: created %s", __func__, name);
+ return (ithd);
+}
+#endif
+
+static void
+ithread_destroy(struct intr_thread *ithread)
+{
+ struct thread *td;
+
+ CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
+ td = ithread->it_thread;
+ thread_lock(td);
+ ithread->it_flags |= IT_DEAD;
+ if (TD_AWAITING_INTR(td)) {
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ }
+ thread_unlock(td);
+}
+
+#ifndef INTR_FILTER
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
+{
+ struct intr_handler *ih, *temp_ih;
+ struct intr_thread *it;
+
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+ return (EINVAL);
+
+ /* Allocate and populate an interrupt handler structure. */
+ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
+ ih->ih_event = ie;
+ ih->ih_pri = pri;
+ if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ /* We can only have one exclusive handler in an event. */
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ if ((flags & INTR_EXCL) ||
+ (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+ mtx_unlock(&ie->ie_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+ }
+ }
+
+ /* Create a thread if we need one. */
+ while (ie->ie_thread == NULL && handler != NULL) {
+ if (ie->ie_flags & IE_ADDING_THREAD)
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+ else {
+ ie->ie_flags |= IE_ADDING_THREAD;
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn");
+ mtx_lock(&ie->ie_lock);
+ ie->ie_flags &= ~IE_ADDING_THREAD;
+ ie->ie_thread = it;
+ it->it_event = ie;
+ ithread_update(it);
+ wakeup(ie);
+ }
+ }
+
+ /* Add the new handler to the event in priority order. */
+ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ }
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ intr_event_update(ie);
+
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ie->ie_name);
+ mtx_unlock(&ie->ie_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ return (0);
+}
+#else
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
+{
+ struct intr_handler *ih, *temp_ih;
+ struct intr_thread *it;
+
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+ return (EINVAL);
+
+ /* Allocate and populate an interrupt handler structure. */
+ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
+ ih->ih_event = ie;
+ ih->ih_pri = pri;
+ if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ /* We can only have one exclusive handler in an event. */
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ if ((flags & INTR_EXCL) ||
+ (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+ mtx_unlock(&ie->ie_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+ }
+ }
+
+ /* For filtered handlers, create a private ithread to run on. */
+ if (filter != NULL && handler != NULL) {
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ it->it_event = ie;
+ ih->ih_thread = it;
+ ithread_update(it); /* XXX - do we really need this?!?!? */
+ } else { /* Create the global per-event thread if we need one. */
+ while (ie->ie_thread == NULL && handler != NULL) {
+ if (ie->ie_flags & IE_ADDING_THREAD)
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+ else {
+ ie->ie_flags |= IE_ADDING_THREAD;
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ ie->ie_flags &= ~IE_ADDING_THREAD;
+ ie->ie_thread = it;
+ it->it_event = ie;
+ ithread_update(it);
+ wakeup(ie);
+ }
+ }
+ }
+
+ /* Add the new handler to the event in priority order. */
+ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ }
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ intr_event_update(ie);
+
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ie->ie_name);
+ mtx_unlock(&ie->ie_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ return (0);
+}
+#endif
+
+/*
+ * Append a description preceded by a ':' to the name of the specified
+ * interrupt handler.
+ */
+int
+intr_event_describe_handler(struct intr_event *ie, void *cookie,
+ const char *descr)
+{
+ struct intr_handler *ih;
+ size_t space;
+ char *start;
+
+ mtx_lock(&ie->ie_lock);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (ih == cookie)
+ break;
+ }
+ if (ih == NULL) {
+ mtx_unlock(&ie->ie_lock);
+ panic("handler %p not found in interrupt event %p", cookie, ie);
+ }
+#endif
+ ih = cookie;
+
+ /*
+ * Look for an existing description by checking for an
+ * existing ":". This assumes device names do not include
+ * colons. If one is found, prepare to insert the new
+ * description at that point. If one is not found, find the
+ * end of the name to use as the insertion point.
+ */
+ start = strchr(ih->ih_name, ':');
+ if (start == NULL)
+ start = strchr(ih->ih_name, 0);
+
+ /*
+ * See if there is enough remaining room in the string for the
+ * description + ":". The "- 1" leaves room for the trailing
+ * '\0'. The "+ 1" accounts for the colon.
+ */
+ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1;
+ if (strlen(descr) + 1 > space) {
+ mtx_unlock(&ie->ie_lock);
+ return (ENOSPC);
+ }
+
+ /* Append a colon followed by the description. */
+ *start = ':';
+ strcpy(start + 1, descr);
+ intr_event_update(ie);
+ mtx_unlock(&ie->ie_lock);
+ return (0);
+}
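+
+/*
+ * Editor's illustrative sketch (not part of this change): appending a
+ * per-handler description. "ie" and "cookie" are assumed to come from
+ * earlier intr_event_create()/intr_event_add_handler() calls, and the
+ * handler/description names are hypothetical.
+ */
+#if 0
+static void
+example_describe(struct intr_event *ie, void *cookie)
+{
+
+	/* A handler added as "em0" would now show up as "em0:rx0". */
+	(void)intr_event_describe_handler(ie, cookie, "rx0");
+}
+#endif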
+
+/*
+ * Return the ie_source field from the intr_event an intr_handler is
+ * associated with.
+ */
+void *
+intr_handler_source(void *cookie)
+{
+ struct intr_handler *ih;
+ struct intr_event *ie;
+
+ ih = (struct intr_handler *)cookie;
+ if (ih == NULL)
+ return (NULL);
+ ie = ih->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ ih->ih_name));
+ return (ie->ie_source);
+}
+
+/*
+ * Sleep until an ithread finishes executing an interrupt handler.
+ *
+ * XXX Doesn't currently handle interrupt filters or fast interrupt
+ * handlers.  This is intended for compatibility with Linux drivers
+ * only.  Do not use in BSD code.
+ */
+void
+_intr_drain(int irq)
+{
+ struct intr_event *ie;
+ struct intr_thread *ithd;
+ struct thread *td;
+
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return;
+ if (ie->ie_thread == NULL)
+ return;
+ ithd = ie->ie_thread;
+ td = ithd->it_thread;
+ /*
+ * We set the flag and wait for it to be cleared to avoid
+ * long delays with potentially busy interrupt handlers
+ * were we to only sample TD_AWAITING_INTR() every tick.
+ */
+ thread_lock(td);
+ if (!TD_AWAITING_INTR(td)) {
+ ithd->it_flags |= IT_WAIT;
+ while (ithd->it_flags & IT_WAIT) {
+ thread_unlock(td);
+ pause("idrain", 1);
+ thread_lock(td);
+ }
+ }
+ thread_unlock(td);
+ return;
+}
+
+#ifndef INTR_FILTER
+int
+intr_event_remove_handler(void *cookie)
+{
+ struct intr_handler *handler = (struct intr_handler *)cookie;
+ struct intr_event *ie;
+#ifdef INVARIANTS
+ struct intr_handler *ih;
+#endif
+#ifdef notyet
+ int dead;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ie = handler->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ handler->ih_name));
+ mtx_lock(&ie->ie_lock);
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ie->ie_name);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ie->ie_lock);
+ panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+	    handler->ih_name, ie->ie_name);
+ok:
+#endif
+ /*
+ * If there is no ithread, then just remove the handler and return.
+ * XXX: Note that an INTR_FAST handler might be running on another
+ * CPU!
+ */
+ if (ie->ie_thread == NULL) {
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+ }
+
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ *
+ * During a cold boot while cold is set, msleep() does not sleep,
+ * so we have to remove the handler here rather than letting the
+ * thread do it.
+ */
+ thread_lock(ie->ie_thread->it_thread);
+ if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ atomic_store_rel_int(&ie->ie_thread->it_need, 1);
+ } else
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ thread_unlock(ie->ie_thread->it_thread);
+ while (handler->ih_flags & IH_DEAD)
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+ intr_event_update(ie);
+#ifdef notyet
+ /*
+	 * XXX: This could be bad in the case of ppbus(4).  Also, I think
+ * this could lead to races of stale data when servicing an
+ * interrupt.
+ */
+ dead = 1;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (!(ih->ih_flags & IH_FAST)) {
+ dead = 0;
+ break;
+ }
+ }
+ if (dead) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
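+
+/*
+ * Usage sketch (hypothetical): the cookie handed out by
+ * intr_event_add_handler() is what a driver passes back down at detach
+ * time, normally via bus_teardown_intr(9), which ends up here.
+ *
+ *	static int
+ *	foo_detach(device_t dev)
+ *	{
+ *		struct foo_softc *sc = device_get_softc(dev);
+ *
+ *		// Blocks until any in-progress ithread pass over this
+ *		// handler has completed (the IH_DEAD dance above).
+ *		bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_intrhand);
+ *		...
+ *	}
+ */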
+
+static int
+intr_event_schedule_thread(struct intr_event *ie)
+{
+ struct intr_entropy entropy;
+ struct intr_thread *it;
+ struct thread *td;
+ struct thread *ctd;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) ||
+ ie->ie_thread == NULL)
+ return (EINVAL);
+
+ ctd = curthread;
+ it = ie->ie_thread;
+ td = it->it_thread;
+ p = td->td_proc;
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+ p->p_pid, td->td_name);
+ entropy.event = (uintptr_t)ie;
+ entropy.td = ctd;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ thread_lock(td);
+ if (TD_AWAITING_INTR(td)) {
+ CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+ td->td_name);
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ } else {
+ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+ __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
+ }
+ thread_unlock(td);
+
+ return (0);
+}
+#else
+int
+intr_event_remove_handler(void *cookie)
+{
+ struct intr_handler *handler = (struct intr_handler *)cookie;
+ struct intr_event *ie;
+ struct intr_thread *it;
+#ifdef INVARIANTS
+ struct intr_handler *ih;
+#endif
+#ifdef notyet
+ int dead;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ie = handler->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ handler->ih_name));
+ mtx_lock(&ie->ie_lock);
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ie->ie_name);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ie->ie_lock);
+ panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+	    handler->ih_name, ie->ie_name);
+ok:
+#endif
+ /*
+ * If there are no ithreads (per event and per handler), then
+ * just remove the handler and return.
+ * XXX: Note that an INTR_FAST handler might be running on another CPU!
+ */
+ if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+ }
+
+ /* Private or global ithread? */
+ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ *
+ * During a cold boot while cold is set, msleep() does not sleep,
+ * so we have to remove the handler here rather than letting the
+ * thread do it.
+ */
+ thread_lock(it->it_thread);
+ if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ } else
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ thread_unlock(it->it_thread);
+ while (handler->ih_flags & IH_DEAD)
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+ /*
+ * At this point, the handler has been disconnected from the event,
+ * so we can kill the private ithread if any.
+ */
+ if (handler->ih_thread) {
+ ithread_destroy(handler->ih_thread);
+ handler->ih_thread = NULL;
+ }
+ intr_event_update(ie);
+#ifdef notyet
+ /*
+	 * XXX: This could be bad in the case of ppbus(4).  Also, I think
+ * this could lead to races of stale data when servicing an
+ * interrupt.
+ */
+ dead = 1;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+		if (ih->ih_handler != NULL) {
+ dead = 0;
+ break;
+ }
+ }
+ if (dead) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
+
+static int
+intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
+{
+ struct intr_entropy entropy;
+ struct thread *td;
+ struct thread *ctd;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
+ return (EINVAL);
+
+ ctd = curthread;
+ td = it->it_thread;
+ p = td->td_proc;
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+ p->p_pid, td->td_name);
+ entropy.event = (uintptr_t)ie;
+ entropy.td = ctd;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ thread_lock(td);
+ if (TD_AWAITING_INTR(td)) {
+ CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+ td->td_name);
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ } else {
+ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+ __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
+ }
+ thread_unlock(td);
+
+ return (0);
+}
+#endif
+
+/*
+ * Allow interrupt event binding for software interrupt handlers -- a no-op,
+ * since interrupts are generated in software rather than being directed by
+ * a PIC.
+ */
+static int
+swi_assign_cpu(void *arg, u_char cpu)
+{
+
+ return (0);
+}
+
+/*
+ * Add a software interrupt handler to a specified event. If a given event
+ * is not specified, then a new event is created.
+ */
+int
+swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
+ void *arg, int pri, enum intr_type flags, void **cookiep)
+{
+ struct intr_event *ie;
+ int error;
+
+ if (flags & INTR_ENTROPY)
+ return (EINVAL);
+
+ ie = (eventp != NULL) ? *eventp : NULL;
+
+ if (ie != NULL) {
+ if (!(ie->ie_flags & IE_SOFT))
+ return (EINVAL);
+ } else {
+ error = intr_event_create(&ie, NULL, IE_SOFT, 0,
+ NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri);
+ if (error)
+ return (error);
+ if (eventp != NULL)
+ *eventp = ie;
+ }
+ error = intr_event_add_handler(ie, name, NULL, handler, arg,
+ PI_SWI(pri), flags, cookiep);
+ return (error);
+}
+
+/*
+ * Schedule a software interrupt thread.
+ */
+void
+swi_sched(void *cookie, int flags)
+{
+ struct intr_handler *ih = (struct intr_handler *)cookie;
+ struct intr_event *ie = ih->ih_event;
+ struct intr_entropy entropy;
+ int error;
+
+ CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
+ ih->ih_need);
+
+ if (harvest.swi) {
+ CTR2(KTR_INTR, "swi_sched: pid %d (%s) gathering entropy",
+ curproc->p_pid, curthread->td_name);
+ entropy.event = (uintptr_t)ih;
+ entropy.td = curthread;
+ random_harvest(&entropy, sizeof(entropy), 1, 0,
+ RANDOM_SWI);
+ }
+
+ /*
+ * Set ih_need for this handler so that if the ithread is already
+ * running it will execute this handler on the next pass. Otherwise,
+ * it will execute it the next time it runs.
+ */
+ atomic_store_rel_int(&ih->ih_need, 1);
+
+ if (!(flags & SWI_DELAY)) {
+ PCPU_INC(cnt.v_soft);
+#ifdef INTR_FILTER
+ error = intr_event_schedule_thread(ie, ie->ie_thread);
+#else
+ error = intr_event_schedule_thread(ie);
+#endif
+ KASSERT(error == 0, ("stray software interrupt"));
+ }
+}
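+
+/*
+ * Usage sketch (hypothetical subsystem): a typical consumer registers a
+ * software interrupt handler once at initialization time and then schedules
+ * it from interrupt or driver context.  The names below are invented for
+ * the example.
+ *
+ *	static struct intr_event *foo_swi_ie;
+ *	static void *foo_swi_cookie;
+ *
+ *	static void
+ *	foo_swi_handler(void *arg)
+ *	{
+ *		// Runs in the swi ithread at PI_SWI(SWI_CLOCK) priority.
+ *	}
+ *
+ *	// At initialization:
+ *	swi_add(&foo_swi_ie, "foo", foo_swi_handler, NULL, SWI_CLOCK,
+ *	    INTR_MPSAFE, &foo_swi_cookie);
+ *
+ *	// Later, from a context that must defer work:
+ *	swi_sched(foo_swi_cookie, 0);
+ */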
+
+/*
+ * Remove a software interrupt handler. Currently this code does not
+ * remove the associated interrupt event if it becomes empty. Calling code
+ * may do so manually via intr_event_destroy(), but that's not really
+ * an optimal interface.
+ */
+int
+swi_remove(void *cookie)
+{
+
+ return (intr_event_remove_handler(cookie));
+}
+
+#ifdef INTR_FILTER
+static void
+priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
+{
+ struct intr_event *ie;
+
+ ie = ih->ih_event;
+ /*
+ * If this handler is marked for death, remove it from
+ * the list of handlers and wake up the sleeper.
+ */
+ if (ih->ih_flags & IH_DEAD) {
+ mtx_lock(&ie->ie_lock);
+ TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+ ih->ih_flags &= ~IH_DEAD;
+ wakeup(ih);
+ mtx_unlock(&ie->ie_lock);
+ return;
+ }
+
+ /* Execute this handler. */
+ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+ __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
+ ih->ih_name, ih->ih_flags);
+
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_unlock(&Giant);
+}
+#endif
+
+/*
+ * This is a public function for use by drivers that mux interrupt
+ * handlers for child devices from their interrupt handler.
+ */
+void
+intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
+{
+ struct intr_handler *ih, *ihn;
+
+ TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) {
+ /*
+ * If this handler is marked for death, remove it from
+ * the list of handlers and wake up the sleeper.
+ */
+ if (ih->ih_flags & IH_DEAD) {
+ mtx_lock(&ie->ie_lock);
+ TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+ ih->ih_flags &= ~IH_DEAD;
+ wakeup(ih);
+ mtx_unlock(&ie->ie_lock);
+ continue;
+ }
+
+ /* Skip filter only handlers */
+ if (ih->ih_handler == NULL)
+ continue;
+
+ /*
+ * For software interrupt threads, we only execute
+ * handlers that have their need flag set. Hardware
+ * interrupt threads always invoke all of their handlers.
+ */
+ if (ie->ie_flags & IE_SOFT) {
+ if (atomic_load_acq_int(&ih->ih_need) == 0)
+ continue;
+ else
+ atomic_store_rel_int(&ih->ih_need, 0);
+ }
+
+ /* Execute this handler. */
+ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+ __func__, p->p_pid, (void *)ih->ih_handler,
+ ih->ih_argument, ih->ih_name, ih->ih_flags);
+
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_unlock(&Giant);
+ }
+}
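+
+/*
+ * Usage sketch (hypothetical bus driver): a parent device that
+ * demultiplexes one hardware interrupt for several children can keep its
+ * own intr_event, add the children's handlers to it, and run them from its
+ * real interrupt handler via this function.  Names are illustrative only.
+ *
+ *	static void
+ *	mux_intr(void *arg)
+ *	{
+ *		struct mux_softc *sc = arg;
+ *
+ *		if (MUX_READ_STATUS(sc) == 0)
+ *			return;
+ *		intr_event_execute_handlers(curproc, sc->sc_child_ie);
+ *		MUX_ACK(sc);
+ *	}
+ */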
+
+static void
+ithread_execute_handlers(struct proc *p, struct intr_event *ie)
+{
+
+ /* Interrupt handlers should not sleep. */
+ if (!(ie->ie_flags & IE_SOFT))
+ THREAD_NO_SLEEPING();
+ intr_event_execute_handlers(p, ie);
+ if (!(ie->ie_flags & IE_SOFT))
+ THREAD_SLEEPING_OK();
+
+ /*
+ * Interrupt storm handling:
+ *
+ * If this interrupt source is currently storming, then throttle
+ * it to only fire the handler once per clock tick.
+ *
+ * If this interrupt source is not currently storming, but the
+ * number of back to back interrupts exceeds the storm threshold,
+ * then enter storming mode.
+ */
+ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
+ !(ie->ie_flags & IE_SOFT)) {
+ /* Report the message only once every second. */
+ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
+ printf(
+ "interrupt storm detected on \"%s\"; throttling interrupt source\n",
+ ie->ie_name);
+ }
+ pause("istorm", 1);
+ } else
+ ie->ie_count++;
+
+ /*
+ * Now that all the handlers have had a chance to run, reenable
+ * the interrupt source.
+ */
+ if (ie->ie_post_ithread != NULL)
+ ie->ie_post_ithread(ie->ie_source);
+}
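+
+/*
+ * Tuning sketch: the threshold compared against ie_count above is exposed
+ * as a writable sysctl (hw.intr_storm_threshold in stock FreeBSD, declared
+ * earlier in this file); setting it to 0 disables storm protection.  A
+ * userland consumer could adjust it like this (illustrative only):
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/sysctl.h>
+ *
+ *	int threshold = 0;	// 0 == disable storm throttling
+ *	sysctlbyname("hw.intr_storm_threshold", NULL, NULL,
+ *	    &threshold, sizeof(threshold));
+ */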
+
+#ifndef INTR_FILTER
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+ struct intr_thread *ithd;
+ struct intr_event *ie;
+ struct thread *td;
+ struct proc *p;
+ int wake;
+
+ td = curthread;
+ p = td->td_proc;
+ ithd = (struct intr_thread *)arg;
+ KASSERT(ithd->it_thread == td,
+ ("%s: ithread and proc linkage out of sync", __func__));
+ ie = ithd->it_event;
+ ie->ie_count = 0;
+ wake = 0;
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+ p->p_pid, td->td_name);
+ free(ithd, M_ITHREAD);
+ kthread_exit();
+ }
+
+ /*
+ * Service interrupts. If another interrupt arrives while
+ * we are running, it will set it_need to note that we
+ * should make another pass.
+ */
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
+ /*
+ * This might need a full read and write barrier
+ * to make sure that this write posts before any
+ * of the memory or device accesses in the
+ * handlers.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+ ithread_execute_handlers(p, ie);
+ }
+ WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ thread_lock(td);
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ TD_SET_IWAIT(td);
+ ie->ie_count = 0;
+ mi_switch(SW_VOL | SWT_IWAIT, NULL);
+ }
+ if (ithd->it_flags & IT_WAIT) {
+ wake = 1;
+ ithd->it_flags &= ~IT_WAIT;
+ }
+ thread_unlock(td);
+ if (wake) {
+ wakeup(ithd);
+ wake = 0;
+ }
+ }
+}
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie: the event connected to this interrupt.
+ * o frame: some archs (e.g. i386) pass a frame to some
+ *          handlers as their main argument.
+ * Return value:
+ * o 0: everything ok.
+ * o EINVAL: stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+ struct intr_handler *ih;
+ struct trapframe *oldframe;
+ struct thread *td;
+ int error, ret, thread;
+
+ td = curthread;
+
+ /* An interrupt with no event or handlers is a stray interrupt. */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+ return (EINVAL);
+
+ /*
+ * Execute fast interrupt handlers directly.
+ * To support clock handlers, if a handler registers
+ * with a NULL argument, then we pass it a pointer to
+ * a trapframe as its argument.
+ */
+ td->td_intr_nesting_level++;
+ thread = 0;
+ ret = 0;
+ critical_enter();
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = frame;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (ih->ih_filter == NULL) {
+ thread = 1;
+ continue;
+ }
+ CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
+ ih->ih_filter, ih->ih_argument == NULL ? frame :
+ ih->ih_argument, ih->ih_name);
+ if (ih->ih_argument == NULL)
+ ret = ih->ih_filter(frame);
+ else
+ ret = ih->ih_filter(ih->ih_argument);
+ KASSERT(ret == FILTER_STRAY ||
+ ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
+ (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
+ ("%s: incorrect return value %#x from %s", __func__, ret,
+ ih->ih_name));
+
+ /*
+ * Wrapper handler special handling:
+ *
+ * in some particular cases (like pccard and pccbb),
+ * the _real_ device handler is wrapped in a couple of
+ * functions - a filter wrapper and an ithread wrapper.
+ * In this case (and just in this case), the filter wrapper
+ * could ask the system to schedule the ithread and mask
+ * the interrupt source if the wrapped handler is composed
+ * of just an ithread handler.
+ *
+ * TODO: write a generic wrapper to avoid people rolling
+ * their own
+ */
+ if (!thread) {
+ if (ret == FILTER_SCHEDULE_THREAD)
+ thread = 1;
+ }
+ }
+ td->td_intr_frame = oldframe;
+
+ if (thread) {
+ if (ie->ie_pre_ithread != NULL)
+ ie->ie_pre_ithread(ie->ie_source);
+ } else {
+ if (ie->ie_post_filter != NULL)
+ ie->ie_post_filter(ie->ie_source);
+ }
+
+ /* Schedule the ithread if needed. */
+ if (thread) {
+ error = intr_event_schedule_thread(ie);
+#ifndef XEN
+ KASSERT(error == 0, ("bad stray interrupt"));
+#else
+ if (error != 0)
+ log(LOG_WARNING, "bad stray interrupt");
+#endif
+ }
+ critical_exit();
+ td->td_intr_nesting_level--;
+ return (0);
+}
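+
+/*
+ * Usage sketch (machine-dependent side, hypothetical): the MD interrupt
+ * dispatch code typically maps the vector to its interrupt-source
+ * structure and hands the associated event plus trapframe to this
+ * function, falling back to stray-interrupt accounting when it returns
+ * EINVAL.
+ *
+ *	void
+ *	md_handle_intr(u_int vector, struct trapframe *frame)
+ *	{
+ *		struct md_intsrc *isrc = md_intr_lookup(vector);
+ *
+ *		if (isrc == NULL ||
+ *		    intr_event_handle(isrc->is_event, frame) != 0)
+ *			md_stray_interrupt(vector);
+ *	}
+ */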
+#else
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+ struct intr_thread *ithd;
+ struct intr_handler *ih;
+ struct intr_event *ie;
+ struct thread *td;
+ struct proc *p;
+ int priv;
+ int wake;
+
+ td = curthread;
+ p = td->td_proc;
+ ih = (struct intr_handler *)arg;
+ priv = (ih->ih_thread != NULL) ? 1 : 0;
+ ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
+ KASSERT(ithd->it_thread == td,
+ ("%s: ithread and proc linkage out of sync", __func__));
+ ie = ithd->it_event;
+ ie->ie_count = 0;
+ wake = 0;
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+ p->p_pid, td->td_name);
+ free(ithd, M_ITHREAD);
+ kthread_exit();
+ }
+
+ /*
+ * Service interrupts. If another interrupt arrives while
+ * we are running, it will set it_need to note that we
+ * should make another pass.
+ */
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
+ /*
+ * This might need a full read and write barrier
+ * to make sure that this write posts before any
+ * of the memory or device accesses in the
+ * handlers.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+ if (priv)
+ priv_ithread_execute_handler(p, ih);
+ else
+ ithread_execute_handlers(p, ie);
+ }
+ WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ thread_lock(td);
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ TD_SET_IWAIT(td);
+ ie->ie_count = 0;
+ mi_switch(SW_VOL | SWT_IWAIT, NULL);
+ }
+ if (ithd->it_flags & IT_WAIT) {
+ wake = 1;
+ ithd->it_flags &= ~IT_WAIT;
+ }
+ thread_unlock(td);
+ if (wake) {
+ wakeup(ithd);
+ wake = 0;
+ }
+ }
+}
+
+/*
+ * Main loop for interrupt filter.
+ *
+ * Some architectures (i386, amd64 and arm) require the optional frame
+ * parameter, and use it as the main argument for fast handler execution
+ * when ih_argument == NULL.
+ *
+ * Return value:
+ * o FILTER_STRAY: No filter recognized the event, and no
+ * filter-less handler is registered on this
+ * line.
+ * o FILTER_HANDLED: A filter claimed the event and served it.
+ * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at
+ * least one filter-less handler on this line.
+ * o FILTER_HANDLED |
+ * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for
+ * scheduling the per-handler ithread.
+ *
+ * In case an ithread has to be scheduled, in *ithd there will be a
+ * pointer to a struct intr_thread containing the thread to be
+ * scheduled.
+ */
+
+static int
+intr_filter_loop(struct intr_event *ie, struct trapframe *frame,
+ struct intr_thread **ithd)
+{
+ struct intr_handler *ih;
+ void *arg;
+ int ret, thread_only;
+
+ ret = 0;
+ thread_only = 0;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ /*
+ * Execute fast interrupt handlers directly.
+ * To support clock handlers, if a handler registers
+ * with a NULL argument, then we pass it a pointer to
+ * a trapframe as its argument.
+ */
+ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument);
+
+ CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
+ ih->ih_filter, ih->ih_handler, arg, ih->ih_name);
+
+ if (ih->ih_filter != NULL)
+ ret = ih->ih_filter(arg);
+ else {
+ thread_only = 1;
+ continue;
+ }
+ KASSERT(ret == FILTER_STRAY ||
+ ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
+ (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
+ ("%s: incorrect return value %#x from %s", __func__, ret,
+ ih->ih_name));
+ if (ret & FILTER_STRAY)
+ continue;
+ else {
+ *ithd = ih->ih_thread;
+ return (ret);
+ }
+ }
+
+ /*
+ * No filters handled the interrupt and we have at least
+ * one handler without a filter. In this case, we schedule
+ * all of the filter-less handlers to run in the ithread.
+ */
+ if (thread_only) {
+ *ithd = ie->ie_thread;
+ return (FILTER_SCHEDULE_THREAD);
+ }
+ return (FILTER_STRAY);
+}
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie: the event connected to this interrupt.
+ * o frame: some archs (e.g. i386) pass a frame to some
+ *          handlers as their main argument.
+ * Return value:
+ * o 0: everything ok.
+ * o EINVAL: stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+ struct intr_thread *ithd;
+ struct trapframe *oldframe;
+ struct thread *td;
+ int thread;
+
+ ithd = NULL;
+ td = curthread;
+
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+ return (EINVAL);
+
+ td->td_intr_nesting_level++;
+ thread = 0;
+ critical_enter();
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = frame;
+ thread = intr_filter_loop(ie, frame, &ithd);
+ if (thread & FILTER_HANDLED) {
+ if (ie->ie_post_filter != NULL)
+ ie->ie_post_filter(ie->ie_source);
+ } else {
+ if (ie->ie_pre_ithread != NULL)
+ ie->ie_pre_ithread(ie->ie_source);
+ }
+ td->td_intr_frame = oldframe;
+ critical_exit();
+
+ /* Interrupt storm logic */
+ if (thread & FILTER_STRAY) {
+ ie->ie_count++;
+ if (ie->ie_count < intr_storm_threshold)
+ printf("Interrupt stray detection not present\n");
+ }
+
+ /* Schedule an ithread if needed. */
+ if (thread & FILTER_SCHEDULE_THREAD) {
+ if (intr_event_schedule_thread(ie, ithd) != 0)
+ panic("%s: impossible stray interrupt", __func__);
+ }
+ td->td_intr_nesting_level--;
+ return (0);
+}
+#endif
+
+#ifdef DDB
+/*
+ * Dump details about an interrupt handler
+ */
+static void
+db_dump_intrhand(struct intr_handler *ih)
+{
+ int comma;
+
+ db_printf("\t%-10s ", ih->ih_name);
+ switch (ih->ih_pri) {
+ case PI_REALTIME:
+ db_printf("CLK ");
+ break;
+ case PI_AV:
+ db_printf("AV ");
+ break;
+ case PI_TTY:
+ db_printf("TTY ");
+ break;
+ case PI_NET:
+ db_printf("NET ");
+ break;
+ case PI_DISK:
+ db_printf("DISK");
+ break;
+ case PI_DULL:
+ db_printf("DULL");
+ break;
+ default:
+ if (ih->ih_pri >= PI_SOFT)
+ db_printf("SWI ");
+ else
+ db_printf("%4u", ih->ih_pri);
+ break;
+ }
+ db_printf(" ");
+ if (ih->ih_filter != NULL) {
+ db_printf("[F]");
+ db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC);
+ }
+ if (ih->ih_handler != NULL) {
+ if (ih->ih_filter != NULL)
+ db_printf(",");
+ db_printf("[H]");
+ db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
+ }
+ db_printf("(%p)", ih->ih_argument);
+ if (ih->ih_need ||
+ (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
+ IH_MPSAFE)) != 0) {
+ db_printf(" {");
+ comma = 0;
+ if (ih->ih_flags & IH_EXCLUSIVE) {
+ if (comma)
+ db_printf(", ");
+ db_printf("EXCL");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_ENTROPY) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ENTROPY");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_DEAD) {
+ if (comma)
+ db_printf(", ");
+ db_printf("DEAD");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_MPSAFE) {
+ if (comma)
+ db_printf(", ");
+ db_printf("MPSAFE");
+ comma = 1;
+ }
+ if (ih->ih_need) {
+ if (comma)
+ db_printf(", ");
+ db_printf("NEED");
+ }
+ db_printf("}");
+ }
+ db_printf("\n");
+}
+
+/*
+ * Dump details about an interrupt event.
+ */
+void
+db_dump_intr_event(struct intr_event *ie, int handlers)
+{
+ struct intr_handler *ih;
+ struct intr_thread *it;
+ int comma;
+
+ db_printf("%s ", ie->ie_fullname);
+ it = ie->ie_thread;
+ if (it != NULL)
+ db_printf("(pid %d)", it->it_thread->td_proc->p_pid);
+ else
+ db_printf("(no thread)");
+ if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 ||
+ (it != NULL && it->it_need)) {
+ db_printf(" {");
+ comma = 0;
+ if (ie->ie_flags & IE_SOFT) {
+ db_printf("SOFT");
+ comma = 1;
+ }
+ if (ie->ie_flags & IE_ENTROPY) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ENTROPY");
+ comma = 1;
+ }
+ if (ie->ie_flags & IE_ADDING_THREAD) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ADDING_THREAD");
+ comma = 1;
+ }
+ if (it != NULL && it->it_need) {
+ if (comma)
+ db_printf(", ");
+ db_printf("NEED");
+ }
+ db_printf("}");
+ }
+ db_printf("\n");
+
+ if (handlers)
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ db_dump_intrhand(ih);
+}
+
+/*
+ * Dump data about interrupt handlers
+ */
+DB_SHOW_COMMAND(intr, db_show_intr)
+{
+ struct intr_event *ie;
+ int all, verbose;
+
+ verbose = strchr(modif, 'v') != NULL;
+ all = strchr(modif, 'a') != NULL;
+ TAILQ_FOREACH(ie, &event_list, ie_list) {
+ if (!all && TAILQ_EMPTY(&ie->ie_handlers))
+ continue;
+ db_dump_intr_event(ie, verbose);
+ if (db_pager_quit)
+ break;
+ }
+}
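+
+/*
+ * Usage sketch: from the in-kernel debugger the command registered above
+ * is invoked as "show intr"; the modifiers parsed from "modif" select
+ * all-events and verbose output, e.g.:
+ *
+ *	db> show intr		(events with handlers only)
+ *	db> show intr/av	(all events, including handler details)
+ */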
+#endif /* DDB */
+
+/*
+ * Start standard software interrupt threads
+ */
+static void
+start_softintr(void *dummy)
+{
+
+ if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih))
+ panic("died while creating vm swi ithread");
+}
+SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr,
+ NULL);
+
+/*
+ * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
+ * The data for this is machine dependent, and the declarations are in
+ * machine dependent code.  The layout of intrnames and intrcnt, however,
+ * is machine independent.
+ *
+ * We do not know the length of intrcnt and intrnames at compile time, so
+ * calculate things at run time.
+ */
+static int
+sysctl_intrnames(SYSCTL_HANDLER_ARGS)
+{
+ return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrnames, "", "Interrupt Names");
+
+static int
+sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
+{
+#ifdef SCTL_MASK32
+ uint32_t *intrcnt32;
+ unsigned i;
+ int error;
+
+ if (req->flags & SCTL_MASK32) {
+ if (!req->oldptr)
+ return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
+ intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
+ if (intrcnt32 == NULL)
+ return (ENOMEM);
+ for (i = 0; i < sintrcnt / sizeof (u_long); i++)
+ intrcnt32[i] = intrcnt[i];
+ error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
+ free(intrcnt32, M_TEMP);
+ return (error);
+ }
+#endif
+ return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrcnt, "", "Interrupt Counts");
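+
+/*
+ * Usage sketch (userland, illustrative): tools such as vmstat(8) and
+ * systat(1) read these two opaque sysctls and walk them in parallel;
+ * intrnames is a sequence of NUL-terminated strings and intrcnt an array
+ * of u_long counters of matching length.
+ *
+ *	size_t nameslen, cntlen;
+ *
+ *	// First query the sizes, then fetch into malloc'ed buffers.
+ *	sysctlbyname("hw.intrnames", NULL, &nameslen, NULL, 0);
+ *	sysctlbyname("hw.intrcnt", NULL, &cntlen, NULL, 0);
+ */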
+
+#ifdef DDB
+/*
+ * DDB command to dump the interrupt statistics.
+ */
+DB_SHOW_COMMAND(intrcnt, db_show_intrcnt)
+{
+ u_long *i;
+ char *cp;
+ u_int j;
+
+ cp = intrnames;
+ j = 0;
+ for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit;
+ i++, j++) {
+ if (*cp == '\0')
+ break;
+ if (*i != 0)
+ db_printf("%s\t%lu\n", cp, *i);
+ cp += strlen(cp) + 1;
+ }
+}
+#endif
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
new file mode 100644
index 0000000..331b0e1
--- /dev/null
+++ b/sys/kern/kern_jail.c
@@ -0,0 +1,4677 @@
+/*-
+ * Copyright (c) 1999 Poul-Henning Kamp.
+ * Copyright (c) 2008 Bjoern A. Zeeb.
+ * Copyright (c) 2009 James Gritton.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/taskqueue.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/refcount.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/namei.h>
+#include <sys/mount.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#endif /* INET6 */
+#endif /* DDB */
+
+#include <security/mac/mac_framework.h>
+
+#define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000"
+
+MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
+static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
+
+/* Keep struct prison prison0 and some code in kern_jail_set() readable. */
+#ifdef INET
+#ifdef INET6
+#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
+#else
+#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
+#endif
+#else /* !INET */
+#ifdef INET6
+#define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
+#else
+#define _PR_IP_SADDRSEL 0
+#endif
+#endif
+
+/* prison0 describes what is "real" about the system. */
+struct prison prison0 = {
+ .pr_id = 0,
+ .pr_name = "0",
+ .pr_ref = 1,
+ .pr_uref = 1,
+ .pr_path = "/",
+ .pr_securelevel = -1,
+ .pr_devfs_rsnum = 0,
+ .pr_childmax = JAIL_MAX,
+ .pr_hostuuid = DEFAULT_HOSTUUID,
+ .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
+#ifdef VIMAGE
+ .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
+#else
+ .pr_flags = PR_HOST|_PR_IP_SADDRSEL,
+#endif
+ .pr_allow = PR_ALLOW_ALL,
+};
+MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
+
+/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
+struct sx allprison_lock;
+SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
+struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
+LIST_HEAD(, prison_racct) allprison_racct;
+int lastprid = 0;
+
+static int do_jail_attach(struct thread *td, struct prison *pr);
+static void prison_complete(void *context, int pending);
+static void prison_deref(struct prison *pr, int flags);
+static char *prison_path(struct prison *pr1, struct prison *pr2);
+static void prison_remove_one(struct prison *pr);
+#ifdef RACCT
+static void prison_racct_attach(struct prison *pr);
+static void prison_racct_modify(struct prison *pr);
+static void prison_racct_detach(struct prison *pr);
+#endif
+#ifdef INET
+static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
+static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
+#endif
+#ifdef INET6
+static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
+static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
+#endif
+
+/* Flags for prison_deref */
+#define PD_DEREF 0x01
+#define PD_DEUREF 0x02
+#define PD_LOCKED 0x04
+#define PD_LIST_SLOCKED 0x08
+#define PD_LIST_XLOCKED 0x10
+
+/*
+ * Parameter names corresponding to PR_* flag values. Size values are for kvm
+ * as we cannot figure out the size of a sparse array, or an array without a
+ * terminating entry.
+ */
+static char *pr_flag_names[] = {
+ [0] = "persist",
+#ifdef INET
+ [7] = "ip4.saddrsel",
+#endif
+#ifdef INET6
+ [8] = "ip6.saddrsel",
+#endif
+};
+const size_t pr_flag_names_size = sizeof(pr_flag_names);
+
+static char *pr_flag_nonames[] = {
+ [0] = "nopersist",
+#ifdef INET
+ [7] = "ip4.nosaddrsel",
+#endif
+#ifdef INET6
+ [8] = "ip6.nosaddrsel",
+#endif
+};
+const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
+
+struct jailsys_flags {
+ const char *name;
+ unsigned disable;
+ unsigned new;
+} pr_flag_jailsys[] = {
+ { "host", 0, PR_HOST },
+#ifdef VIMAGE
+ { "vnet", 0, PR_VNET },
+#endif
+#ifdef INET
+ { "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
+#endif
+#ifdef INET6
+ { "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
+#endif
+};
+const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
+
+static char *pr_allow_names[] = {
+ "allow.set_hostname",
+ "allow.sysvipc",
+ "allow.raw_sockets",
+ "allow.chflags",
+ "allow.mount",
+ "allow.quotas",
+ "allow.socket_af",
+ "allow.mount.devfs",
+ "allow.mount.nullfs",
+ "allow.mount.zfs",
+ "allow.mount.procfs",
+ "allow.mount.tmpfs",
+};
+const size_t pr_allow_names_size = sizeof(pr_allow_names);
+
+static char *pr_allow_nonames[] = {
+ "allow.noset_hostname",
+ "allow.nosysvipc",
+ "allow.noraw_sockets",
+ "allow.nochflags",
+ "allow.nomount",
+ "allow.noquotas",
+ "allow.nosocket_af",
+ "allow.mount.nodevfs",
+ "allow.mount.nonullfs",
+ "allow.mount.nozfs",
+ "allow.mount.noprocfs",
+ "allow.mount.notmpfs",
+};
+const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
+
+#define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME
+#define JAIL_DEFAULT_ENFORCE_STATFS 2
+#define JAIL_DEFAULT_DEVFS_RSNUM 0
+static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
+static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
+static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
+#if defined(INET) || defined(INET6)
+static unsigned jail_max_af_ips = 255;
+#endif
+
+#ifdef INET
+static int
+qcmp_v4(const void *ip1, const void *ip2)
+{
+ in_addr_t iaa, iab;
+
+ /*
+	 * We need to compare in host byte order (HBO) here so the list is
+	 * sorted as callers expect.  Sorting NBO addresses gives you
+ * interesting results. If you do not understand, do not try.
+ */
+ iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
+ iab = ntohl(((const struct in_addr *)ip2)->s_addr);
+
+ /*
+	 * Do not simply return the difference of the two numbers; an int is
+	 * not wide enough to hold it.
+ */
+ if (iaa > iab)
+ return (1);
+ else if (iaa < iab)
+ return (-1);
+ else
+ return (0);
+}
+#endif
+
+#ifdef INET6
+static int
+qcmp_v6(const void *ip1, const void *ip2)
+{
+ const struct in6_addr *ia6a, *ia6b;
+ int i, rc;
+
+ ia6a = (const struct in6_addr *)ip1;
+ ia6b = (const struct in6_addr *)ip2;
+
+ rc = 0;
+ for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) {
+ if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
+ rc = 1;
+ else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
+ rc = -1;
+ }
+ return (rc);
+}
+#endif
+
+/*
+ * struct jail_args {
+ * struct jail *jail;
+ * };
+ */
+int
+sys_jail(struct thread *td, struct jail_args *uap)
+{
+ uint32_t version;
+ int error;
+ struct jail j;
+
+ error = copyin(uap->jail, &version, sizeof(uint32_t));
+ if (error)
+ return (error);
+
+ switch (version) {
+ case 0:
+ {
+ struct jail_v0 j0;
+
+ /* FreeBSD single IPv4 jails. */
+ bzero(&j, sizeof(struct jail));
+ error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
+ if (error)
+ return (error);
+ j.version = j0.version;
+ j.path = j0.path;
+ j.hostname = j0.hostname;
+ j.ip4s = j0.ip_number;
+ break;
+ }
+
+ case 1:
+ /*
+ * Version 1 was used by multi-IPv4 jail implementations
+ * that never made it into the official kernel.
+ */
+ return (EINVAL);
+
+ case 2: /* JAIL_API_VERSION */
+		/* FreeBSD multi-IPv4/IPv6, noIP jails. */
+ error = copyin(uap->jail, &j, sizeof(struct jail));
+ if (error)
+ return (error);
+ break;
+
+ default:
+ /* Sci-Fi jails are not supported, sorry. */
+ return (EINVAL);
+ }
+ return (kern_jail(td, &j));
+}
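+
+/*
+ * Usage sketch (userland, illustrative): a current (version 2) caller of
+ * jail(2) fills in struct jail and lets kern_jail() translate it into the
+ * iovec form consumed by kern_jail_set().  Field values here are made up.
+ *
+ *	struct jail j = {
+ *		.version = JAIL_API_VERSION,
+ *		.path = "/var/jail/www",
+ *		.hostname = "www.example.org",
+ *		.jailname = "www",
+ *		.ip4s = 0,
+ *		.ip6s = 0,
+ *	};
+ *	int jid = jail(&j);
+ */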
+
+int
+kern_jail(struct thread *td, struct jail *j)
+{
+ struct iovec optiov[2 * (4
+ + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
+#ifdef INET
+ + 1
+#endif
+#ifdef INET6
+ + 1
+#endif
+ )];
+ struct uio opt;
+ char *u_path, *u_hostname, *u_name;
+#ifdef INET
+ uint32_t ip4s;
+ struct in_addr *u_ip4;
+#endif
+#ifdef INET6
+ struct in6_addr *u_ip6;
+#endif
+ size_t tmplen;
+ int error, enforce_statfs, fi;
+
+ bzero(&optiov, sizeof(optiov));
+ opt.uio_iov = optiov;
+ opt.uio_iovcnt = 0;
+ opt.uio_offset = -1;
+ opt.uio_resid = -1;
+ opt.uio_segflg = UIO_SYSSPACE;
+ opt.uio_rw = UIO_READ;
+ opt.uio_td = td;
+
+ /* Set permissions for top-level jails from sysctls. */
+ if (!jailed(td->td_ucred)) {
+ for (fi = 0; fi < sizeof(pr_allow_names) /
+ sizeof(pr_allow_names[0]); fi++) {
+ optiov[opt.uio_iovcnt].iov_base =
+ (jail_default_allow & (1 << fi))
+ ? pr_allow_names[fi] : pr_allow_nonames[fi];
+ optiov[opt.uio_iovcnt].iov_len =
+ strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
+ opt.uio_iovcnt += 2;
+ }
+ optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
+ opt.uio_iovcnt++;
+ enforce_statfs = jail_default_enforce_statfs;
+ optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
+ optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
+ opt.uio_iovcnt++;
+ }
+
+ tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
+#ifdef INET
+ ip4s = (j->version == 0) ? 1 : j->ip4s;
+ if (ip4s > jail_max_af_ips)
+ return (EINVAL);
+ tmplen += ip4s * sizeof(struct in_addr);
+#else
+ if (j->ip4s > 0)
+ return (EINVAL);
+#endif
+#ifdef INET6
+ if (j->ip6s > jail_max_af_ips)
+ return (EINVAL);
+ tmplen += j->ip6s * sizeof(struct in6_addr);
+#else
+ if (j->ip6s > 0)
+ return (EINVAL);
+#endif
+ u_path = malloc(tmplen, M_TEMP, M_WAITOK);
+ u_hostname = u_path + MAXPATHLEN;
+ u_name = u_hostname + MAXHOSTNAMELEN;
+#ifdef INET
+ u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
+#endif
+#ifdef INET6
+#ifdef INET
+ u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
+#else
+ u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
+#endif
+#endif
+ optiov[opt.uio_iovcnt].iov_base = "path";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("path");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_path;
+ error = copyinstr(j->path, u_path, MAXPATHLEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = "host.hostname";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_hostname;
+ error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ if (j->jailname != NULL) {
+ optiov[opt.uio_iovcnt].iov_base = "name";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("name");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_name;
+ error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ }
+#ifdef INET
+ optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_ip4;
+ optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
+ if (j->version == 0)
+ u_ip4->s_addr = j->ip4s;
+ else {
+ error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ }
+ opt.uio_iovcnt++;
+#endif
+#ifdef INET6
+ optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_ip6;
+ optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
+ error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+#endif
+ KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
+ ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
+ error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
+ free(u_path, M_TEMP);
+ return (error);
+}
+
+/*
+ * struct jail_set_args {
+ * struct iovec *iovp;
+ * unsigned int iovcnt;
+ * int flags;
+ * };
+ */
+int
+sys_jail_set(struct thread *td, struct jail_set_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ /* Check that we have an even number of iovecs. */
+ if (uap->iovcnt & 1)
+ return (EINVAL);
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_jail_set(td, auio, uap->flags);
+ free(auio, M_IOV);
+ return (error);
+}
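+
+/*
+ * Usage sketch (userland, illustrative): jail_set(2) takes name/value
+ * iovec pairs, which is also the form libjail(3) builds internally.
+ * Boolean parameters such as "persist" are passed with a NULL value of
+ * length 0.  Values below are invented for the example.
+ *
+ *	struct iovec iov[4];
+ *	int jid;
+ *
+ *	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
+ *	iov[1].iov_base = "www";	iov[1].iov_len = sizeof("www");
+ *	iov[2].iov_base = "persist";	iov[2].iov_len = sizeof("persist");
+ *	iov[3].iov_base = NULL;		iov[3].iov_len = 0;
+ *	jid = jail_set(iov, 4, JAIL_CREATE);
+ */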
+
+int
+kern_jail_set(struct thread *td, struct uio *optuio, int flags)
+{
+ struct nameidata nd;
+#ifdef INET
+ struct in_addr *ip4;
+#endif
+#ifdef INET6
+ struct in6_addr *ip6;
+#endif
+ struct vfsopt *opt;
+ struct vfsoptlist *opts;
+ struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
+ struct vnode *root;
+ char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
+ char *g_path;
+#if defined(INET) || defined(INET6)
+ struct prison *tppr;
+ void *op;
+#endif
+ unsigned long hid;
+ size_t namelen, onamelen;
+ int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
+ int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
+ int fi, jid, jsys, len, level;
+ int childmax, rsnum, slevel;
+ int fullpath_disabled;
+#if defined(INET) || defined(INET6)
+ int ii, ij;
+#endif
+#ifdef INET
+ int ip4s, redo_ip4;
+#endif
+#ifdef INET6
+ int ip6s, redo_ip6;
+#endif
+ uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
+ unsigned tallow;
+ char numbuf[12];
+
+ error = priv_check(td, PRIV_JAIL_SET);
+ if (!error && (flags & JAIL_ATTACH))
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ return (error);
+ mypr = ppr = td->td_ucred->cr_prison;
+ if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
+ return (EPERM);
+ if (flags & ~JAIL_SET_MASK)
+ return (EINVAL);
+
+ /*
+ * Check all the parameters before committing to anything. Not all
+ * errors can be caught early, but we may as well try. Also, this
+ * takes care of some expensive stuff (path lookup) before getting
+ * the allprison lock.
+ *
+ * XXX Jails are not filesystems, and jail parameters are not mount
+ * options. But it makes more sense to re-use the vfsopt code
+ * than duplicate it under a different name.
+ */
+ error = vfs_buildopts(optuio, &opts);
+ if (error)
+ return (error);
+#ifdef INET
+ ip4 = NULL;
+#endif
+#ifdef INET6
+ ip6 = NULL;
+#endif
+ g_path = NULL;
+
+ error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
+ if (error == ENOENT)
+ jid = 0;
+ else if (error != 0)
+ goto done_free;
+
+ error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
+ if (error == ENOENT)
+ gotslevel = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotslevel = 1;
+
+ error =
+ vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
+ if (error == ENOENT)
+ gotchildmax = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotchildmax = 1;
+
+ error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
+ if (error == ENOENT)
+ gotenforce = 0;
+ else if (error != 0)
+ goto done_free;
+ else if (enforce < 0 || enforce > 2) {
+ error = EINVAL;
+ goto done_free;
+ } else
+ gotenforce = 1;
+
+ error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
+ if (error == ENOENT)
+ gotrsnum = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotrsnum = 1;
+
+ pr_flags = ch_flags = 0;
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++) {
+ if (pr_flag_names[fi] == NULL)
+ continue;
+ vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
+ vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
+ }
+ ch_flags |= pr_flags;
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
+ error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
+ sizeof(jsys));
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
+ goto done_free;
+ switch (jsys) {
+ case JAIL_SYS_DISABLE:
+ if (!pr_flag_jailsys[fi].disable) {
+ error = EINVAL;
+ goto done_free;
+ }
+ pr_flags |= pr_flag_jailsys[fi].disable;
+ break;
+ case JAIL_SYS_NEW:
+ pr_flags |= pr_flag_jailsys[fi].new;
+ break;
+ case JAIL_SYS_INHERIT:
+ break;
+ default:
+ error = EINVAL;
+ goto done_free;
+ }
+ ch_flags |=
+ pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
+ }
+ if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
+ && !(pr_flags & PR_PERSIST)) {
+ error = EINVAL;
+ vfs_opterror(opts, "new jail must persist or attach");
+ goto done_errmsg;
+ }
+#ifdef VIMAGE
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
+ error = EINVAL;
+ vfs_opterror(opts, "vnet cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+#ifdef INET
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
+ error = EINVAL;
+ vfs_opterror(opts, "ip4 cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+#ifdef INET6
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
+ error = EINVAL;
+ vfs_opterror(opts, "ip6 cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+
+ pr_allow = ch_allow = 0;
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++) {
+ vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
+ vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
+ }
+ ch_allow |= pr_allow;
+
+ error = vfs_getopt(opts, "name", (void **)&name, &len);
+ if (error == ENOENT)
+ name = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (len == 0 || name[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
+ if (error == ENOENT)
+ host = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || host[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
+ if (error == ENOENT)
+ domain = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || domain[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
+ if (error == ENOENT)
+ uuid = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || uuid[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > HOSTUUIDLEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ uint32_t hid32;
+
+ error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
+ hid = hid32;
+ } else
+#endif
+ error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
+ if (error == ENOENT)
+ gothid = 0;
+ else if (error != 0)
+ goto done_free;
+ else {
+ gothid = 1;
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ }
+
+#ifdef INET
+ error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
+ if (error == ENOENT)
+ ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
+ else if (error != 0)
+ goto done_free;
+ else if (ip4s & (sizeof(*ip4) - 1)) {
+ error = EINVAL;
+ goto done_free;
+ } else {
+ ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
+ if (ip4s == 0)
+ pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
+ else {
+ pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
+ ip4s /= sizeof(*ip4);
+ if (ip4s > jail_max_af_ips) {
+ error = EINVAL;
+ vfs_opterror(opts, "too many IPv4 addresses");
+ goto done_errmsg;
+ }
+ ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
+ bcopy(op, ip4, ip4s * sizeof(*ip4));
+ /*
+			 * All IP addresses except ip[0] are sorted, preserving
+			 * the primary IP address as given from userland.
+			 * This special IP is used for unbound outgoing
+			 * connections as well as for "loopback" traffic in case
+			 * source address selection cannot find a more fitting
+			 * address to connect from.
+ */
+ if (ip4s > 1)
+ qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
+ /*
+ * Check for duplicate addresses and do some simple
+ * zero and broadcast checks. If users give other bogus
+ * addresses it is their problem.
+ *
+ * We do not have to care about byte order for these
+ * checks so we will do them in NBO.
+ */
+ for (ii = 0; ii < ip4s; ii++) {
+ if (ip4[ii].s_addr == INADDR_ANY ||
+ ip4[ii].s_addr == INADDR_BROADCAST) {
+ error = EINVAL;
+ goto done_free;
+ }
+ if ((ii+1) < ip4s &&
+ (ip4[0].s_addr == ip4[ii+1].s_addr ||
+ ip4[ii].s_addr == ip4[ii+1].s_addr)) {
+ error = EINVAL;
+ goto done_free;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef INET6
+ error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
+ if (error == ENOENT)
+ ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
+ else if (error != 0)
+ goto done_free;
+ else if (ip6s & (sizeof(*ip6) - 1)) {
+ error = EINVAL;
+ goto done_free;
+ } else {
+ ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
+ if (ip6s == 0)
+ pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
+ else {
+ pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
+ ip6s /= sizeof(*ip6);
+ if (ip6s > jail_max_af_ips) {
+ error = EINVAL;
+ vfs_opterror(opts, "too many IPv6 addresses");
+ goto done_errmsg;
+ }
+ ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
+ bcopy(op, ip6, ip6s * sizeof(*ip6));
+ if (ip6s > 1)
+ qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
+ for (ii = 0; ii < ip6s; ii++) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
+ error = EINVAL;
+ goto done_free;
+ }
+ if ((ii+1) < ip6s &&
+ (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
+ IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
+ {
+ error = EINVAL;
+ goto done_free;
+ }
+ }
+ }
+ }
+#endif
+
+#if defined(VIMAGE) && (defined(INET) || defined(INET6))
+ if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "vnet jails cannot have IP address restrictions");
+ goto done_errmsg;
+ }
+#endif
+
+ fullpath_disabled = 0;
+ root = NULL;
+ error = vfs_getopt(opts, "path", (void **)&path, &len);
+ if (error == ENOENT)
+ path = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (flags & JAIL_UPDATE) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "path cannot be changed after creation");
+ goto done_errmsg;
+ }
+ if (len == 0 || path[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ path, td);
+ error = namei(&nd);
+ if (error)
+ goto done_free;
+ root = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ strlcpy(g_path, path, MAXPATHLEN);
+ error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
+ if (error == 0)
+ path = g_path;
+ else if (error == ENODEV) {
+ /* proceed if sysctl debug.disablefullpath == 1 */
+ fullpath_disabled = 1;
+ if (len < 2 || (len == 2 && path[0] == '/'))
+ path = NULL;
+ } else {
+ /* exit on other errors */
+ goto done_free;
+ }
+ if (root->v_type != VDIR) {
+ error = ENOTDIR;
+ vput(root);
+ goto done_free;
+ }
+ VOP_UNLOCK(root, 0);
+ if (fullpath_disabled) {
+ /* Leave room for a real-root full pathname. */
+ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
+ ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+ }
+
+ /*
+ * Grab the allprison lock before letting modules check their
+ * parameters. Once we have it, do not let go so we'll have a
+ * consistent view of the OSD list.
+ */
+ sx_xlock(&allprison_lock);
+ error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
+ if (error)
+ goto done_unlock_list;
+
+ /* By now, all parameters should have been noted. */
+ TAILQ_FOREACH(opt, opts, link) {
+ if (!opt->seen && strcmp(opt->name, "errmsg")) {
+ error = EINVAL;
+ vfs_opterror(opts, "unknown parameter: %s", opt->name);
+ goto done_unlock_list;
+ }
+ }
+
+ /*
+ * See if we are creating a new record or updating an existing one.
+ * This abuses the file error codes ENOENT and EEXIST.
+ */
+ cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
+ if (!cuflags) {
+ error = EINVAL;
+ vfs_opterror(opts, "no valid operation (create or update)");
+ goto done_unlock_list;
+ }
+ pr = NULL;
+ namelc = NULL;
+ if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
+ namelc = strrchr(name, '.');
+ jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
+ if (*p != '\0')
+ jid = 0;
+ }
+ if (jid != 0) {
+ /*
+ * See if a requested jid already exists. There is an
+ * information leak here if the jid exists but is not within
+ * the caller's jail hierarchy. Jail creators will get EEXIST
+ * even though they cannot see the jail, and CREATE | UPDATE
+ * will return ENOENT which is not normally a valid error.
+ */
+ if (jid < 0) {
+ error = EINVAL;
+ vfs_opterror(opts, "negative jid");
+ goto done_unlock_list;
+ }
+ pr = prison_find(jid);
+ if (pr != NULL) {
+ ppr = pr->pr_parent;
+ /* Create: jid must not exist. */
+ if (cuflags == JAIL_CREATE) {
+ mtx_unlock(&pr->pr_mtx);
+ error = EEXIST;
+ vfs_opterror(opts, "jail %d already exists",
+ jid);
+ goto done_unlock_list;
+ }
+ if (!prison_ischild(mypr, pr)) {
+ mtx_unlock(&pr->pr_mtx);
+ pr = NULL;
+ } else if (pr->pr_uref == 0) {
+ if (!(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ jid);
+ goto done_unlock_list;
+ } else if ((flags & JAIL_ATTACH) ||
+ (pr_flags & PR_PERSIST)) {
+ /*
+ * A dying jail might be resurrected
+ * (via attach or persist), but first
+ * it must determine if another jail
+ * has claimed its name. Accomplish
+ * this by implicitly re-setting the
+ * name.
+ */
+ if (name == NULL)
+ name = prison_name(mypr, pr);
+ }
+ }
+ }
+ if (pr == NULL) {
+ /* Update: jid must exist. */
+ if (cuflags == JAIL_UPDATE) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d not found", jid);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /*
+ * If the caller provided a name, look for a jail by that name.
+ * This has different semantics for creates and updates keyed by jid
+ * (where the name must not already exist in a different jail),
+ * and updates keyed by the name itself (where the name must exist
+ * because that is the jail being updated).
+ */
+ if (name != NULL) {
+ namelc = strrchr(name, '.');
+ if (namelc == NULL)
+ namelc = name;
+ else {
+ /*
+ * This is a hierarchical name. Split it into the
+ * parent and child names, and make sure the parent
+ * exists or matches an already found jail.
+ */
+ *namelc = '\0';
+ if (pr != NULL) {
+ if (strncmp(name, ppr->pr_name, namelc - name)
+ || ppr->pr_name[namelc - name] != '\0') {
+ mtx_unlock(&pr->pr_mtx);
+ error = EINVAL;
+ vfs_opterror(opts,
+ "cannot change jail's parent");
+ goto done_unlock_list;
+ }
+ } else {
+ ppr = prison_find_name(mypr, name);
+ if (ppr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts,
+ "jail \"%s\" not found", name);
+ goto done_unlock_list;
+ }
+ mtx_unlock(&ppr->pr_mtx);
+ }
+ name = ++namelc;
+ }
+ if (name[0] != '\0') {
+ namelen =
+ (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
+ name_again:
+ deadpr = NULL;
+ FOREACH_PRISON_CHILD(ppr, tpr) {
+ if (tpr != pr && tpr->pr_ref > 0 &&
+ !strcmp(tpr->pr_name + namelen, name)) {
+ if (pr == NULL &&
+ cuflags != JAIL_CREATE) {
+ mtx_lock(&tpr->pr_mtx);
+ if (tpr->pr_ref > 0) {
+ /*
+ * Use this jail
+ * for updates.
+ */
+ if (tpr->pr_uref > 0) {
+ pr = tpr;
+ break;
+ }
+ deadpr = tpr;
+ }
+ mtx_unlock(&tpr->pr_mtx);
+ } else if (tpr->pr_uref > 0) {
+ /*
+ * Create, or update(jid):
+ * name must not exist in an
+ * active sibling jail.
+ */
+ error = EEXIST;
+ if (pr != NULL)
+ mtx_unlock(&pr->pr_mtx);
+ vfs_opterror(opts,
+ "jail \"%s\" already exists",
+ name);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /* If no active jail is found, use a dying one. */
+ if (deadpr != NULL && pr == NULL) {
+ if (flags & JAIL_DYING) {
+ mtx_lock(&deadpr->pr_mtx);
+ if (deadpr->pr_ref == 0) {
+ mtx_unlock(&deadpr->pr_mtx);
+ goto name_again;
+ }
+ pr = deadpr;
+ } else if (cuflags == JAIL_UPDATE) {
+ error = ENOENT;
+ vfs_opterror(opts,
+ "jail \"%s\" is dying", name);
+ goto done_unlock_list;
+ }
+ }
+ /* Update: name must exist if no jid. */
+ else if (cuflags == JAIL_UPDATE && pr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" not found",
+ name);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /* Update: must provide a jid or name. */
+ else if (cuflags == JAIL_UPDATE && pr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts, "update specified no jail");
+ goto done_unlock_list;
+ }
+
+ /* If there's no prison to update, create a new one and link it in. */
+ if (pr == NULL) {
+ for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
+ if (tpr->pr_childcount >= tpr->pr_childmax) {
+ error = EPERM;
+ vfs_opterror(opts, "prison limit exceeded");
+ goto done_unlock_list;
+ }
+ created = 1;
+ mtx_lock(&ppr->pr_mtx);
+ if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
+ mtx_unlock(&ppr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "parent jail went away!");
+ goto done_unlock_list;
+ }
+ ppr->pr_ref++;
+ ppr->pr_uref++;
+ mtx_unlock(&ppr->pr_mtx);
+ pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
+ if (jid == 0) {
+ /* Find the next free jid. */
+ jid = lastprid + 1;
+ findnext:
+ if (jid == JAIL_MAX)
+ jid = 1;
+ TAILQ_FOREACH(tpr, &allprison, pr_list) {
+ if (tpr->pr_id < jid)
+ continue;
+ if (tpr->pr_id > jid || tpr->pr_ref == 0) {
+ TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
+ break;
+ }
+ if (jid == lastprid) {
+ error = EAGAIN;
+ vfs_opterror(opts,
+ "no available jail IDs");
+ free(pr, M_PRISON);
+ prison_deref(ppr, PD_DEREF |
+ PD_DEUREF | PD_LIST_XLOCKED);
+ goto done_releroot;
+ }
+ jid++;
+ goto findnext;
+ }
+ lastprid = jid;
+ } else {
+ /*
+ * The caller supplied a jid (one that does not already exist),
+ * so just find where to insert it.
+ */
+ TAILQ_FOREACH(tpr, &allprison, pr_list)
+ if (tpr->pr_id >= jid) {
+ TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
+ break;
+ }
+ }
+ if (tpr == NULL)
+ TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
+ LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
+ for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
+ tpr->pr_childcount++;
+
+ pr->pr_parent = ppr;
+ pr->pr_id = jid;
+
+ /* Set some default values, and inherit some from the parent. */
+ if (name == NULL)
+ name = "";
+ if (path == NULL) {
+ path = "/";
+ root = mypr->pr_root;
+ vref(root);
+ }
+ strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
+ pr->pr_flags |= PR_HOST;
+#if defined(INET) || defined(INET6)
+#ifdef VIMAGE
+ if (!(pr_flags & PR_VNET))
+#endif
+ {
+#ifdef INET
+ if (!(ch_flags & PR_IP4_USER))
+ pr->pr_flags |=
+ PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
+ else if (!(pr_flags & PR_IP4_USER)) {
+ pr->pr_flags |= ppr->pr_flags & PR_IP4;
+ if (ppr->pr_ip4 != NULL) {
+ pr->pr_ip4s = ppr->pr_ip4s;
+ pr->pr_ip4 = malloc(pr->pr_ip4s *
+ sizeof(struct in_addr), M_PRISON,
+ M_WAITOK);
+ bcopy(ppr->pr_ip4, pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ }
+ }
+#endif
+#ifdef INET6
+ if (!(ch_flags & PR_IP6_USER))
+ pr->pr_flags |=
+ PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
+ else if (!(pr_flags & PR_IP6_USER)) {
+ pr->pr_flags |= ppr->pr_flags & PR_IP6;
+ if (ppr->pr_ip6 != NULL) {
+ pr->pr_ip6s = ppr->pr_ip6s;
+ pr->pr_ip6 = malloc(pr->pr_ip6s *
+ sizeof(struct in6_addr), M_PRISON,
+ M_WAITOK);
+ bcopy(ppr->pr_ip6, pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ }
+ }
+#endif
+ }
+#endif
+ /* Source address selection is always on by default. */
+ pr->pr_flags |= _PR_IP_SADDRSEL;
+
+ pr->pr_securelevel = ppr->pr_securelevel;
+ pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
+ pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
+ pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
+
+ LIST_INIT(&pr->pr_children);
+ mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
+
+#ifdef VIMAGE
+ /* Allocate a new vnet if specified. */
+ pr->pr_vnet = (pr_flags & PR_VNET)
+ ? vnet_alloc() : ppr->pr_vnet;
+#endif
+ /*
+ * Allocate a dedicated cpuset for each jail.
+ * Unlike other initial settings, this may return an error.
+ */
+ error = cpuset_create_root(ppr, &pr->pr_cpuset);
+ if (error) {
+ prison_deref(pr, PD_LIST_XLOCKED);
+ goto done_releroot;
+ }
+
+ mtx_lock(&pr->pr_mtx);
+ /*
+ * New prisons do not yet have a reference, because we do not
+ * want others to see the incomplete prison once the
+ * allprison_lock is downgraded.
+ */
+ } else {
+ created = 0;
+ /*
+ * Grab a reference for existing prisons, to ensure they
+ * continue to exist for the duration of the call.
+ */
+ pr->pr_ref++;
+#if defined(VIMAGE) && (defined(INET) || defined(INET6))
+ if ((pr->pr_flags & PR_VNET) &&
+ (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "vnet jails cannot have IP address restrictions");
+ goto done_deref_locked;
+ }
+#endif
+#ifdef INET
+ if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "ip4 cannot be changed after creation");
+ goto done_deref_locked;
+ }
+#endif
+#ifdef INET6
+ if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "ip6 cannot be changed after creation");
+ goto done_deref_locked;
+ }
+#endif
+ }
+
+ /* Do final error checking before setting anything. */
+ if (gotslevel) {
+ if (slevel < ppr->pr_securelevel) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotchildmax) {
+ if (childmax >= ppr->pr_childmax) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotenforce) {
+ if (enforce < ppr->pr_enforce_statfs) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotrsnum) {
+ /*
+ * devfs_rsnum is a uint16_t
+ */
+ if (rsnum < 0 || rsnum > 65535) {
+ error = EINVAL;
+ goto done_deref_locked;
+ }
+ /*
+ * Nested jails always inherit parent's devfs ruleset
+ */
+ if (jailed(td->td_ucred)) {
+ if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
+ error = EPERM;
+ goto done_deref_locked;
+ } else
+ rsnum = ppr->pr_devfs_rsnum;
+ }
+ }
+#ifdef INET
+ if (ip4s > 0) {
+ if (ppr->pr_flags & PR_IP4) {
+ /*
+ * Make sure the new set of IP addresses is a
+ * subset of the parent's list. Don't worry
+ * about the parent being unlocked, as any
+ * setting is done with allprison_lock held.
+ */
+ for (ij = 0; ij < ppr->pr_ip4s; ij++)
+ if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij == ppr->pr_ip4s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ if (ip4s > 1) {
+ for (ii = ij = 1; ii < ip4s; ii++) {
+ if (ip4[ii].s_addr ==
+ ppr->pr_ip4[0].s_addr)
+ continue;
+ for (; ij < ppr->pr_ip4s; ij++)
+ if (ip4[ii].s_addr ==
+ ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij == ppr->pr_ip4s)
+ break;
+ }
+ if (ij == ppr->pr_ip4s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ }
+ /*
+ * Check for conflicting IP addresses. We permit them
+ * if there is no more than one IP on each jail. If
+ * there is a duplicate on a jail with more than one
+ * IP, stop checking and return an error.
+ */
+ tppr = ppr;
+#ifdef VIMAGE
+ for (; tppr != &prison0; tppr = tppr->pr_parent)
+ if (tppr->pr_flags & PR_VNET)
+ break;
+#endif
+ FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
+ if (tpr == pr ||
+#ifdef VIMAGE
+ (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
+#endif
+ tpr->pr_uref == 0) {
+ descend = 0;
+ continue;
+ }
+ if (!(tpr->pr_flags & PR_IP4_USER))
+ continue;
+ descend = 0;
+ if (tpr->pr_ip4 == NULL ||
+ (ip4s == 1 && tpr->pr_ip4s == 1))
+ continue;
+ for (ii = 0; ii < ip4s; ii++) {
+ if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
+ error = EADDRINUSE;
+ vfs_opterror(opts,
+ "IPv4 addresses clash");
+ goto done_deref_locked;
+ }
+ }
+ }
+ }
+#endif
+#ifdef INET6
+ if (ip6s > 0) {
+ if (ppr->pr_flags & PR_IP6) {
+ /*
+ * Make sure the new set of IP addresses is a
+ * subset of the parent's list.
+ */
+ for (ij = 0; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(&ip6[0],
+ &ppr->pr_ip6[ij]))
+ break;
+ if (ij == ppr->pr_ip6s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ if (ip6s > 1) {
+ for (ii = ij = 1; ii < ip6s; ii++) {
+ if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
+ &ppr->pr_ip6[0]))
+ continue;
+ for (; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(
+ &ip6[ii], &ppr->pr_ip6[ij]))
+ break;
+ if (ij == ppr->pr_ip6s)
+ break;
+ }
+ if (ij == ppr->pr_ip6s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ }
+ /* Check for conflicting IP addresses. */
+ tppr = ppr;
+#ifdef VIMAGE
+ for (; tppr != &prison0; tppr = tppr->pr_parent)
+ if (tppr->pr_flags & PR_VNET)
+ break;
+#endif
+ FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
+ if (tpr == pr ||
+#ifdef VIMAGE
+ (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
+#endif
+ tpr->pr_uref == 0) {
+ descend = 0;
+ continue;
+ }
+ if (!(tpr->pr_flags & PR_IP6_USER))
+ continue;
+ descend = 0;
+ if (tpr->pr_ip6 == NULL ||
+ (ip6s == 1 && tpr->pr_ip6s == 1))
+ continue;
+ for (ii = 0; ii < ip6s; ii++) {
+ if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
+ error = EADDRINUSE;
+ vfs_opterror(opts,
+ "IPv6 addresses clash");
+ goto done_deref_locked;
+ }
+ }
+ }
+ }
+#endif
+ onamelen = namelen = 0;
+ if (name != NULL) {
+ /* Give a default name of the jid. */
+ if (name[0] == '\0')
+ snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
+ else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
+ *p == '\0')) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "name cannot be numeric (unless it is the jid)");
+ goto done_deref_locked;
+ }
+ /*
+ * Make sure the name isn't too long for the prison or its
+ * children.
+ */
+ onamelen = strlen(pr->pr_name);
+ namelen = strlen(name);
+ if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
+ error = ENAMETOOLONG;
+ goto done_deref_locked;
+ }
+ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+ if (strlen(tpr->pr_name) + (namelen - onamelen) >=
+ sizeof(pr->pr_name)) {
+ error = ENAMETOOLONG;
+ goto done_deref_locked;
+ }
+ }
+ }
+ if (pr_allow & ~ppr->pr_allow) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+
+ /* Set the parameters of the prison. */
+#ifdef INET
+ redo_ip4 = 0;
+ if (pr_flags & PR_IP4_USER) {
+ pr->pr_flags |= PR_IP4;
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4s = ip4s;
+ pr->pr_ip4 = ip4;
+ ip4 = NULL;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip4(tpr, NULL)) {
+ redo_ip4 = 1;
+ descend = 0;
+ }
+ }
+ }
+#endif
+#ifdef INET6
+ redo_ip6 = 0;
+ if (pr_flags & PR_IP6_USER) {
+ pr->pr_flags |= PR_IP6;
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6s = ip6s;
+ pr->pr_ip6 = ip6;
+ ip6 = NULL;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip6(tpr, NULL)) {
+ redo_ip6 = 1;
+ descend = 0;
+ }
+ }
+ }
+#endif
+ if (gotslevel) {
+ pr->pr_securelevel = slevel;
+ /* Set all child jails to be at least this level. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ if (tpr->pr_securelevel < slevel)
+ tpr->pr_securelevel = slevel;
+ }
+ if (gotchildmax) {
+ pr->pr_childmax = childmax;
+ /* Set all child jails to under this limit. */
+ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
+ if (tpr->pr_childmax > childmax - level)
+ tpr->pr_childmax = childmax > level
+ ? childmax - level : 0;
+ }
+ if (gotenforce) {
+ pr->pr_enforce_statfs = enforce;
+ /* Pass this restriction on to the children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ if (tpr->pr_enforce_statfs < enforce)
+ tpr->pr_enforce_statfs = enforce;
+ }
+ if (gotrsnum) {
+ pr->pr_devfs_rsnum = rsnum;
+ /* Pass this restriction on to the children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ tpr->pr_devfs_rsnum = rsnum;
+ }
+ if (name != NULL) {
+ if (ppr == &prison0)
+ strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
+ else
+ snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
+ ppr->pr_name, name);
+ /* Change this component of child names. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+ bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
+ strlen(tpr->pr_name + onamelen) + 1);
+ bcopy(pr->pr_name, tpr->pr_name, namelen);
+ }
+ }
+ if (path != NULL) {
+ /* Try to keep a real-rooted full pathname. */
+ if (fullpath_disabled && path[0] == '/' &&
+ strcmp(mypr->pr_path, "/"))
+ snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
+ mypr->pr_path, path);
+ else
+ strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
+ pr->pr_root = root;
+ }
+ if (PR_HOST & ch_flags & ~pr_flags) {
+ if (pr->pr_flags & PR_HOST) {
+ /*
+ * Copy the parent's host info. As with pr_ip4 above,
+ * the lack of a lock on the parent is not a problem;
+ * it is always set with allprison_lock at least
+ * shared, and is held exclusively here.
+ */
+ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
+ sizeof(pr->pr_hostname));
+ strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
+ sizeof(pr->pr_domainname));
+ strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
+ sizeof(pr->pr_hostuuid));
+ pr->pr_hostid = pr->pr_parent->pr_hostid;
+ }
+ } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
+ /* Set this prison, and any descendants without PR_HOST. */
+ if (host != NULL)
+ strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
+ if (domain != NULL)
+ strlcpy(pr->pr_domainname, domain,
+ sizeof(pr->pr_domainname));
+ if (uuid != NULL)
+ strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
+ if (gothid)
+ pr->pr_hostid = hid;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+ if (tpr->pr_flags & PR_HOST)
+ descend = 0;
+ else {
+ if (host != NULL)
+ strlcpy(tpr->pr_hostname,
+ pr->pr_hostname,
+ sizeof(tpr->pr_hostname));
+ if (domain != NULL)
+ strlcpy(tpr->pr_domainname,
+ pr->pr_domainname,
+ sizeof(tpr->pr_domainname));
+ if (uuid != NULL)
+ strlcpy(tpr->pr_hostuuid,
+ pr->pr_hostuuid,
+ sizeof(tpr->pr_hostuuid));
+ if (gothid)
+ tpr->pr_hostid = hid;
+ }
+ }
+ }
+ if ((tallow = ch_allow & ~pr_allow)) {
+ /* Clear allow bits in all children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ tpr->pr_allow &= ~tallow;
+ }
+ pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
+ /*
+ * Persistent prisons get an extra reference, and prisons losing their
+ * persist flag lose that reference. Only do this for existing prisons
+ * for now, so new ones will remain unseen until after the module
+ * handlers have completed.
+ */
+ if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
+ if (pr_flags & PR_PERSIST) {
+ pr->pr_ref++;
+ pr->pr_uref++;
+ } else {
+ pr->pr_ref--;
+ pr->pr_uref--;
+ }
+ }
+ pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
+ mtx_unlock(&pr->pr_mtx);
+
+#ifdef RACCT
+ if (created)
+ prison_racct_attach(pr);
+#endif
+
+ /*
+ * Locks may have prevented a complete restriction of child IP
+ * addresses. If so, allocate some more memory and try again.
+ */
+#ifdef INET
+ while (redo_ip4) {
+ ip4s = pr->pr_ip4s;
+ ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
+ mtx_lock(&pr->pr_mtx);
+ redo_ip4 = 0;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip4(tpr, ip4)) {
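+ /* Either the spare buffer was handed off, or none was available and another pass is needed. */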
+ if (ip4 != NULL)
+ ip4 = NULL;
+ else
+ redo_ip4 = 1;
+ }
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+#endif
+#ifdef INET6
+ while (redo_ip6) {
+ ip6s = pr->pr_ip6s;
+ ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
+ mtx_lock(&pr->pr_mtx);
+ redo_ip6 = 0;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip6(tpr, ip6)) {
+ if (ip6 != NULL)
+ ip6 = NULL;
+ else
+ redo_ip6 = 1;
+ }
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+#endif
+
+ /* Let the modules do their work. */
+ sx_downgrade(&allprison_lock);
+ if (created) {
+ error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
+ if (error) {
+ prison_deref(pr, PD_LIST_SLOCKED);
+ goto done_errmsg;
+ }
+ }
+ error = osd_jail_call(pr, PR_METHOD_SET, opts);
+ if (error) {
+ prison_deref(pr, created
+ ? PD_LIST_SLOCKED
+ : PD_DEREF | PD_LIST_SLOCKED);
+ goto done_errmsg;
+ }
+
+ /* Attach this process to the prison if requested. */
+ if (flags & JAIL_ATTACH) {
+ mtx_lock(&pr->pr_mtx);
+ error = do_jail_attach(td, pr);
+ if (error) {
+ vfs_opterror(opts, "attach failed");
+ if (!created)
+ prison_deref(pr, PD_DEREF);
+ goto done_errmsg;
+ }
+ }
+
+#ifdef RACCT
+ if (!created) {
+ sx_sunlock(&allprison_lock);
+ prison_racct_modify(pr);
+ sx_slock(&allprison_lock);
+ }
+#endif
+
+ td->td_retval[0] = pr->pr_id;
+
+ /*
+ * Now that it is all there, drop the temporary reference from existing
+ * prisons, or add a reference to newly created persistent prisons
+ * (which was not done earlier so that the prison would not be publicly
+ * visible).
+ */
+ if (!created) {
+ prison_deref(pr, (flags & JAIL_ATTACH)
+ ? PD_DEREF
+ : PD_DEREF | PD_LIST_SLOCKED);
+ } else {
+ if (pr_flags & PR_PERSIST) {
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ if (!(flags & JAIL_ATTACH))
+ sx_sunlock(&allprison_lock);
+ }
+
+ goto done_errmsg;
+
+ done_deref_locked:
+ prison_deref(pr, created
+ ? PD_LOCKED | PD_LIST_XLOCKED
+ : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
+ goto done_releroot;
+ done_unlock_list:
+ sx_xunlock(&allprison_lock);
+ done_releroot:
+ if (root != NULL)
+ vrele(root);
+ done_errmsg:
+ if (error) {
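+ /* Copy the error message back into the caller's "errmsg" option, if one was supplied. */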
+ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
+ if (errmsg_len > 0) {
+ errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
+ if (errmsg_pos > 0) {
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ bcopy(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ else
+ copyout(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ }
+ }
+ }
+ done_free:
+#ifdef INET
+ free(ip4, M_PRISON);
+#endif
+#ifdef INET6
+ free(ip6, M_PRISON);
+#endif
+ if (g_path != NULL)
+ free(g_path, M_TEMP);
+ vfs_freeopts(opts);
+ return (error);
+}
+
+
+/*
+ * struct jail_get_args {
+ * struct iovec *iovp;
+ * unsigned int iovcnt;
+ * int flags;
+ * };
+ */
+int
+sys_jail_get(struct thread *td, struct jail_get_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ /* Check that we have an even number of iovecs. */
+ if (uap->iovcnt & 1)
+ return (EINVAL);
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_jail_get(td, auio, uap->flags);
+ if (error == 0)
+ error = copyout(auio->uio_iov, uap->iovp,
+ uap->iovcnt * sizeof (struct iovec));
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_jail_get(struct thread *td, struct uio *optuio, int flags)
+{
+ struct prison *pr, *mypr;
+ struct vfsopt *opt;
+ struct vfsoptlist *opts;
+ char *errmsg, *name;
+ int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
+
+ if (flags & ~JAIL_GET_MASK)
+ return (EINVAL);
+
+ /* Get the parameter list. */
+ error = vfs_buildopts(optuio, &opts);
+ if (error)
+ return (error);
+ errmsg_pos = vfs_getopt_pos(opts, "errmsg");
+ mypr = td->td_ucred->cr_prison;
+
+ /*
+ * Find the prison specified by one of: lastjid, jid, name.
+ */
+ sx_slock(&allprison_lock);
+ error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
+ if (error == 0) {
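+ /* lastjid: find the next prison visible to the caller with a higher jid. */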
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0 &&
+ (pr->pr_uref > 0 || (flags & JAIL_DYING)))
+ break;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ if (pr != NULL)
+ goto found_prison;
+ error = ENOENT;
+ vfs_opterror(opts, "no jail after %d", jid);
+ goto done_unlock_list;
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
+ if (error == 0) {
+ if (jid != 0) {
+ pr = prison_find_child(mypr, jid);
+ if (pr != NULL) {
+ if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ jid);
+ goto done_unlock_list;
+ }
+ goto found_prison;
+ }
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d not found", jid);
+ goto done_unlock_list;
+ }
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ error = vfs_getopt(opts, "name", (void **)&name, &len);
+ if (error == 0) {
+ if (len == 0 || name[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_unlock_list;
+ }
+ pr = prison_find_name(mypr, name);
+ if (pr != NULL) {
+ if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" is dying",
+ name);
+ goto done_unlock_list;
+ }
+ goto found_prison;
+ }
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" not found", name);
+ goto done_unlock_list;
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ vfs_opterror(opts, "no jail specified");
+ error = ENOENT;
+ goto done_unlock_list;
+
+ found_prison:
+ /* Get the parameters of the prison. */
+ pr->pr_ref++;
+ locked = PD_LOCKED;
+ td->td_retval[0] = pr->pr_id;
+ error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
+ error = vfs_setopt(opts, "parent", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "name", prison_name(mypr, pr));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
+ sizeof(pr->pr_cpuset->cs_id));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "path", prison_path(mypr, pr));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#ifdef INET
+ error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#endif
+#ifdef INET6
+ error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#endif
+ error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
+ sizeof(pr->pr_securelevel));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
+ sizeof(pr->pr_childcount));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
+ sizeof(pr->pr_childmax));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ uint32_t hid32 = pr->pr_hostid;
+
+ error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
+ } else
+#endif
+ error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
+ sizeof(pr->pr_hostid));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
+ sizeof(pr->pr_enforce_statfs));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
+ sizeof(pr->pr_devfs_rsnum));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
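+ /* Report each boolean flag together with its "no"-prefixed inverse. */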
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++) {
+ if (pr_flag_names[fi] == NULL)
+ continue;
+ i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
+ error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
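+ /* Map the disable/new flag bits onto JAIL_SYS_DISABLE, JAIL_SYS_NEW, or JAIL_SYS_INHERIT. */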
+ i = pr->pr_flags &
+ (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
+ i = pr_flag_jailsys[fi].disable &&
+ (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
+ : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
+ : JAIL_SYS_INHERIT;
+ error =
+ vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++) {
+ if (pr_allow_names[fi] == NULL)
+ continue;
+ i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
+ error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ i = (pr->pr_uref == 0);
+ error = vfs_setopt(opts, "dying", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, "nodying", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+
+ /* Get the module parameters. */
+ mtx_unlock(&pr->pr_mtx);
+ locked = 0;
+ error = osd_jail_call(pr, PR_METHOD_GET, opts);
+ if (error)
+ goto done_deref;
+ prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
+
+ /* By now, all parameters should have been noted. */
+ TAILQ_FOREACH(opt, opts, link) {
+ if (!opt->seen && strcmp(opt->name, "errmsg")) {
+ error = EINVAL;
+ vfs_opterror(opts, "unknown parameter: %s", opt->name);
+ goto done_errmsg;
+ }
+ }
+
+ /* Write the fetched parameters back to userspace. */
+ error = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (opt->pos >= 0 && opt->pos != errmsg_pos) {
+ pos = 2 * opt->pos + 1;
+ optuio->uio_iov[pos].iov_len = opt->len;
+ if (opt->value != NULL) {
+ if (optuio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(opt->value,
+ optuio->uio_iov[pos].iov_base,
+ opt->len);
+ } else {
+ error = copyout(opt->value,
+ optuio->uio_iov[pos].iov_base,
+ opt->len);
+ if (error)
+ break;
+ }
+ }
+ }
+ }
+ goto done_errmsg;
+
+ done_deref:
+ prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
+ goto done_errmsg;
+
+ done_unlock_list:
+ sx_sunlock(&allprison_lock);
+ done_errmsg:
+ if (error && errmsg_pos >= 0) {
+ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
+ errmsg_pos = 2 * errmsg_pos + 1;
+ if (errmsg_len > 0) {
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ bcopy(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ else
+ copyout(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ }
+ }
+ vfs_freeopts(opts);
+ return (error);
+}
+
+
+/*
+ * struct jail_remove_args {
+ * int jid;
+ * };
+ */
+int
+sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
+{
+ struct prison *pr, *cpr, *lpr, *tpr;
+ int descend, error;
+
+ error = priv_check(td, PRIV_JAIL_REMOVE);
+ if (error)
+ return (error);
+
+ sx_xlock(&allprison_lock);
+ pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
+ if (pr == NULL) {
+ sx_xunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ /* Remove all descendants of this prison, then remove this prison. */
+ pr->pr_ref++;
+ pr->pr_flags |= PR_REMOVE;
+ if (!LIST_EMPTY(&pr->pr_children)) {
+ mtx_unlock(&pr->pr_mtx);
+ lpr = NULL;
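+ /* Removal of each marked child is deferred one iteration, as prison_remove_one drops the allprison lock. */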
+ FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
+ mtx_lock(&cpr->pr_mtx);
+ if (cpr->pr_ref > 0) {
+ tpr = cpr;
+ cpr->pr_ref++;
+ cpr->pr_flags |= PR_REMOVE;
+ } else {
+ /* Already removed - do not do it again. */
+ tpr = NULL;
+ }
+ mtx_unlock(&cpr->pr_mtx);
+ if (lpr != NULL) {
+ mtx_lock(&lpr->pr_mtx);
+ prison_remove_one(lpr);
+ sx_xlock(&allprison_lock);
+ }
+ lpr = tpr;
+ }
+ if (lpr != NULL) {
+ mtx_lock(&lpr->pr_mtx);
+ prison_remove_one(lpr);
+ sx_xlock(&allprison_lock);
+ }
+ mtx_lock(&pr->pr_mtx);
+ }
+ prison_remove_one(pr);
+ return (0);
+}
+
+static void
+prison_remove_one(struct prison *pr)
+{
+ struct proc *p;
+ int deuref;
+
+ /* If the prison was persistent, it is not anymore. */
+ deuref = 0;
+ if (pr->pr_flags & PR_PERSIST) {
+ pr->pr_ref--;
+ deuref = PD_DEUREF;
+ pr->pr_flags &= ~PR_PERSIST;
+ }
+
+ /*
+ * jail_remove added a reference. If that's the only one, remove
+ * the prison now.
+ */
+ KASSERT(pr->pr_ref > 0,
+ ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
+ if (pr->pr_ref == 1) {
+ prison_deref(pr,
+ deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
+ return;
+ }
+
+ mtx_unlock(&pr->pr_mtx);
+ sx_xunlock(&allprison_lock);
+ /*
+ * Kill all processes unfortunate enough to be attached to this prison.
+ */
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NEW && p->p_ucred &&
+ p->p_ucred->cr_prison == pr)
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ /* Remove the temporary reference added by jail_remove. */
+ prison_deref(pr, deuref | PD_DEREF);
+}
+
+
+/*
+ * struct jail_attach_args {
+ * int jid;
+ * };
+ */
+int
+sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
+{
+ struct prison *pr;
+ int error;
+
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ return (error);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
+ if (pr == NULL) {
+ sx_sunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ /*
+ * Do not allow a process to attach to a prison that is not
+ * considered to be "alive".
+ */
+ if (pr->pr_uref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ return (do_jail_attach(td, pr));
+}
+
+static int
+do_jail_attach(struct thread *td, struct prison *pr)
+{
+ struct prison *ppr;
+ struct proc *p;
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ /*
+ * XXX: Note that there is a slight race here if two threads
+ * in the same privileged process attempt to attach to two
+ * different jails at the same time. It is important for
+ * user processes not to do this, or they might end up with
+ * a process root from one prison, but attached to the jail
+ * of another.
+ */
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+
+ /* Let modules do whatever they need to prepare for attaching. */
+ error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
+ if (error) {
+ prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
+ return (error);
+ }
+ sx_sunlock(&allprison_lock);
+
+ /*
+ * Reparent the newly attached process to this jail.
+ */
+ ppr = td->td_ucred->cr_prison;
+ p = td->td_proc;
+ error = cpuset_setproc_update_set(p, pr->pr_cpuset);
+ if (error)
+ goto e_revert_osd;
+
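+ /* Move the process's root and current directories into the prison root. */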
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0)
+ goto e_unlock;
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
+ goto e_unlock;
+#endif
+ VOP_UNLOCK(pr->pr_root, 0);
+ if ((error = change_root(pr->pr_root, td)))
+ goto e_revert_osd;
+
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ setsugid(p);
+ crcopy(newcred, oldcred);
+ newcred->cr_prison = pr;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ crfree(oldcred);
+ prison_deref(ppr, PD_DEREF | PD_DEUREF);
+ return (0);
+ e_unlock:
+ VOP_UNLOCK(pr->pr_root, 0);
+ e_revert_osd:
+ /* Tell modules this thread is still in its old jail after all. */
+ (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
+ prison_deref(pr, PD_DEREF | PD_DEUREF);
+ return (error);
+}
+
+
+/*
+ * Returns a locked prison instance, or NULL on failure.
+ */
+struct prison *
+prison_find(int prid)
+{
+ struct prison *pr;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id == prid) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0)
+ return (pr);
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
+ */
+struct prison *
+prison_find_child(struct prison *mypr, int prid)
+{
+ struct prison *pr;
+ int descend;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
+ if (pr->pr_id == prid) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0)
+ return (pr);
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Look for the name relative to mypr. Returns a locked prison or NULL.
+ */
+struct prison *
+prison_find_name(struct prison *mypr, const char *name)
+{
+ struct prison *pr, *deadpr;
+ size_t mylen;
+ int descend;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
+ again:
+ deadpr = NULL;
+ FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
+ if (!strcmp(pr->pr_name + mylen, name)) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0) {
+ if (pr->pr_uref > 0)
+ return (pr);
+ deadpr = pr;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ /* There was no valid prison - perhaps there was a dying one. */
+ if (deadpr != NULL) {
+ mtx_lock(&deadpr->pr_mtx);
+ if (deadpr->pr_ref == 0) {
+ mtx_unlock(&deadpr->pr_mtx);
+ goto again;
+ }
+ }
+ return (deadpr);
+}
+
+/*
+ * See if a prison has the specific flag set.
+ */
+int
+prison_flag(struct ucred *cred, unsigned flag)
+{
+
+ /* This is an atomic read, so no locking is necessary. */
+ return (cred->cr_prison->pr_flags & flag);
+}
+
+int
+prison_allow(struct ucred *cred, unsigned flag)
+{
+
+ /* This is an atomic read, so no locking is necessary. */
+ return (cred->cr_prison->pr_allow & flag);
+}
+
+/*
+ * Remove a prison reference. If that was the last reference, remove the
+ * prison itself - but not in this context in case there are locks held.
+ */
+void
+prison_free_locked(struct prison *pr)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ pr->pr_ref--;
+ if (pr->pr_ref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
+ taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
+ return;
+ }
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_free(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ prison_free_locked(pr);
+}
+
+static void
+prison_complete(void *context, int pending)
+{
+
+ prison_deref((struct prison *)context, 0);
+}
+
+/*
+ * Remove a prison reference (usually). This internal version assumes no
+ * mutexes are held, except perhaps the prison itself. If there are no more
+ * references, release and delist the prison. On completion, the prison lock
+ * and the allprison lock are both unlocked.
+ */
+static void
+prison_deref(struct prison *pr, int flags)
+{
+ struct prison *ppr, *tpr;
+
+ if (!(flags & PD_LOCKED))
+ mtx_lock(&pr->pr_mtx);
+ for (;;) {
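+ /* Drop the requested references; free and delist the prison if none remain. */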
+ if (flags & PD_DEUREF) {
+ pr->pr_uref--;
+ KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
+ }
+ if (flags & PD_DEREF)
+ pr->pr_ref--;
+ /* If the prison still has references, nothing else to do. */
+ if (pr->pr_ref > 0) {
+ mtx_unlock(&pr->pr_mtx);
+ if (flags & PD_LIST_SLOCKED)
+ sx_sunlock(&allprison_lock);
+ else if (flags & PD_LIST_XLOCKED)
+ sx_xunlock(&allprison_lock);
+ return;
+ }
+
+ mtx_unlock(&pr->pr_mtx);
+ if (flags & PD_LIST_SLOCKED) {
+ if (!sx_try_upgrade(&allprison_lock)) {
+ sx_sunlock(&allprison_lock);
+ sx_xlock(&allprison_lock);
+ }
+ } else if (!(flags & PD_LIST_XLOCKED))
+ sx_xlock(&allprison_lock);
+
+ TAILQ_REMOVE(&allprison, pr, pr_list);
+ LIST_REMOVE(pr, pr_sibling);
+ ppr = pr->pr_parent;
+ for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
+ tpr->pr_childcount--;
+ sx_xunlock(&allprison_lock);
+
+#ifdef VIMAGE
+ if (pr->pr_vnet != ppr->pr_vnet)
+ vnet_destroy(pr->pr_vnet);
+#endif
+ if (pr->pr_root != NULL)
+ vrele(pr->pr_root);
+ mtx_destroy(&pr->pr_mtx);
+#ifdef INET
+ free(pr->pr_ip4, M_PRISON);
+#endif
+#ifdef INET6
+ free(pr->pr_ip6, M_PRISON);
+#endif
+ if (pr->pr_cpuset != NULL)
+ cpuset_rel(pr->pr_cpuset);
+ osd_jail_exit(pr);
+#ifdef RACCT
+ prison_racct_detach(pr);
+#endif
+ free(pr, M_PRISON);
+
+ /* Removing a prison frees a reference on its parent. */
+ pr = ppr;
+ mtx_lock(&pr->pr_mtx);
+ flags = PD_DEREF | PD_DEUREF;
+ }
+}
+
+void
+prison_hold_locked(struct prison *pr)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ KASSERT(pr->pr_ref > 0,
+ ("Trying to hold dead prison (jid=%d).", pr->pr_id));
+ pr->pr_ref++;
+}
+
+void
+prison_hold(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_proc_hold(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ KASSERT(pr->pr_uref > 0,
+ ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_proc_free(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ KASSERT(pr->pr_uref > 0,
+ ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
+ prison_deref(pr, PD_DEUREF | PD_LOCKED);
+}
+
+
+#ifdef INET
+/*
+ * Restrict a prison's IP address list with its parent's, possibly replacing
+ * it. Return true if the replacement buffer was used (or would have been).
+ */
+static int
+prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
+{
+ int ii, ij, used;
+ struct prison *ppr;
+
+ ppr = pr->pr_parent;
+ if (!(pr->pr_flags & PR_IP4_USER)) {
+ /* This has no user settings, so just copy the parent's list. */
+ if (pr->pr_ip4s < ppr->pr_ip4s) {
+ /*
+ * There's no room for the parent's list. Use the
+ * new list buffer, which is assumed to be big enough
+ * (if it was passed). If there's no buffer, try to
+ * allocate one.
+ */
+ used = 1;
+ if (newip4 == NULL) {
+ newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
+ M_PRISON, M_NOWAIT);
+ if (newip4 != NULL)
+ used = 0;
+ }
+ if (newip4 != NULL) {
+ bcopy(ppr->pr_ip4, newip4,
+ ppr->pr_ip4s * sizeof(*newip4));
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = newip4;
+ pr->pr_ip4s = ppr->pr_ip4s;
+ }
+ return (used);
+ }
+ pr->pr_ip4s = ppr->pr_ip4s;
+ if (pr->pr_ip4s > 0)
+ bcopy(ppr->pr_ip4, pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*newip4));
+ else if (pr->pr_ip4 != NULL) {
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = NULL;
+ }
+ } else if (pr->pr_ip4s > 0) {
+ /* Remove addresses that aren't in the parent. */
+ for (ij = 0; ij < ppr->pr_ip4s; ij++)
+ if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij < ppr->pr_ip4s)
+ ii = 1;
+ else {
+ bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
+ --pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ ii = 0;
+ }
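+ /* Merge-walk both sorted lists, deleting addresses not found in the parent. */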
+ for (ij = 1; ii < pr->pr_ip4s; ) {
+ if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
+ ii++;
+ continue;
+ }
+ switch (ij >= ppr->pr_ip4s ? -1 :
+ qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
+ case -1:
+ bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
+ (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
+ break;
+ case 0:
+ ii++;
+ ij++;
+ break;
+ case 1:
+ ij++;
+ break;
+ }
+ }
+ if (pr->pr_ip4s == 0) {
+ pr->pr_flags |= PR_IP4_DISABLE;
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = NULL;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Pass back primary IPv4 address of this jail.
+ *
+ * If not restricted return success but do not alter the address. Caller has
+ * to make sure to initialize it correctly (e.g. INADDR_ANY).
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
+ * Address returned in NBO.
+ */
+int
+prison_get_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Return 1 if we should do proper source address selection or are not jailed.
+ * We will return 0 if we should bypass source address selection in favour
+ * of the primary jail IPv4 address. Only in this case is *ia updated and
+ * returned in NBO.
+ * Return EAFNOSUPPORT if this jail does not allow IPv4.
+ */
+int
+prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ struct in_addr lia;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ if (!jailed(cred))
+ return (1);
+
+ pr = cred->cr_prison;
+ if (pr->pr_flags & PR_IP4_SADDRSEL)
+ return (1);
+
+ lia.s_addr = INADDR_ANY;
+ error = prison_get_ip4(cred, &lia);
+ if (error)
+ return (error);
+ if (lia.s_addr == INADDR_ANY)
+ return (1);
+
+ ia->s_addr = lia.s_addr;
+ return (0);
+}
+
+/*
+ * Return true if pr1 and pr2 have the same IPv4 address restrictions.
+ */
+int
+prison_equal_ip4(struct prison *pr1, struct prison *pr2)
+{
+
+ if (pr1 == pr2)
+ return (1);
+
+ /*
+ * No need to lock since the PR_IP4_USER flag can't be altered for
+ * existing prisons.
+ */
+ while (pr1 != &prison0 &&
+#ifdef VIMAGE
+ !(pr1->pr_flags & PR_VNET) &&
+#endif
+ !(pr1->pr_flags & PR_IP4_USER))
+ pr1 = pr1->pr_parent;
+ while (pr2 != &prison0 &&
+#ifdef VIMAGE
+ !(pr2->pr_flags & PR_VNET) &&
+#endif
+ !(pr2->pr_flags & PR_IP4_USER))
+ pr2 = pr2->pr_parent;
+ return (pr1 == pr2);
+}
+
+/*
+ * Make sure our (source) address is set to something meaningful to this
+ * jail.
+ *
+ * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv4. The address is passed in and returned in NBO.
+ */
+int
+prison_local_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ struct in_addr ia0;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ ia0.s_addr = ntohl(ia->s_addr);
+ if (ia0.s_addr == INADDR_LOOPBACK) {
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ if (ia0.s_addr == INADDR_ANY) {
+ /*
+ * In case there is only 1 IPv4 address, bind directly.
+ */
+ if (pr->pr_ip4s == 1)
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ error = _prison_check_ip4(pr, ia);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Rewrite destination address in case we will connect to loopback address.
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
+ * The address is passed in and returned in NBO.
+ */
+int
+prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ /*
+ * Return success because nothing had to be changed.
+ */
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred/prison.
+ *
+ * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv4. The address is passed in NBO.
+ */
+static int
+_prison_check_ip4(struct prison *pr, struct in_addr *ia)
+{
+ int i, a, z, d;
+
+ /*
+ * Check the primary IP.
+ */
+ if (pr->pr_ip4[0].s_addr == ia->s_addr)
+ return (0);
+
+ /*
+ * All the other IPs are sorted so we can do a binary search.
+ */
+ a = 0;
+ z = pr->pr_ip4s - 2;
+ while (a <= z) {
+ i = (a + z) / 2;
+ d = qcmp_v4(&pr->pr_ip4[i+1], ia);
+ if (d > 0)
+ z = i - 1;
+ else if (d < 0)
+ a = i + 1;
+ else
+ return (0);
+ }
+
+ return (EADDRNOTAVAIL);
+}
+
+int
+prison_check_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ error = _prison_check_ip4(pr, ia);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+#endif
+
+#ifdef INET6
+static int
+prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
+{
+ int ii, ij, used;
+ struct prison *ppr;
+
+ ppr = pr->pr_parent;
+ if (!(pr->pr_flags & PR_IP6_USER)) {
+ /* This has no user settings, so just copy the parent's list. */
+ if (pr->pr_ip6s < ppr->pr_ip6s) {
+ /*
+ * There's no room for the parent's list. Use the
+ * new list buffer, which is assumed to be big enough
+ * (if it was passed). If there's no buffer, try to
+ * allocate one.
+ */
+ used = 1;
+ if (newip6 == NULL) {
+ newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
+ M_PRISON, M_NOWAIT);
+ if (newip6 != NULL)
+ used = 0;
+ }
+ if (newip6 != NULL) {
+ bcopy(ppr->pr_ip6, newip6,
+ ppr->pr_ip6s * sizeof(*newip6));
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = newip6;
+ pr->pr_ip6s = ppr->pr_ip6s;
+ }
+ return (used);
+ }
+ pr->pr_ip6s = ppr->pr_ip6s;
+ if (pr->pr_ip6s > 0)
+ bcopy(ppr->pr_ip6, pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*newip6));
+ else if (pr->pr_ip6 != NULL) {
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = NULL;
+ }
+ } else if (pr->pr_ip6s > 0) {
+ /* Remove addresses that aren't in the parent. */
+ for (ij = 0; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
+ &ppr->pr_ip6[ij]))
+ break;
+ if (ij < ppr->pr_ip6s)
+ ii = 1;
+ else {
+ bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
+ --pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ ii = 0;
+ }
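+ /* Merge-walk both sorted lists, deleting addresses not found in the parent. */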
+ for (ij = 1; ii < pr->pr_ip6s; ) {
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
+ &ppr->pr_ip6[0])) {
+ ii++;
+ continue;
+ }
+ switch (ij >= ppr->pr_ip6s ? -1 :
+ qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
+ case -1:
+ bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
+ (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
+ break;
+ case 0:
+ ii++;
+ ij++;
+ break;
+ case 1:
+ ij++;
+ break;
+ }
+ }
+ if (pr->pr_ip6s == 0) {
+ pr->pr_flags |= PR_IP6_DISABLE;
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = NULL;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Pass back primary IPv6 address for this jail.
+ *
+ * If not restricted return success but do not alter the address. Caller has
+ * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
+ */
+int
+prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Return 1 if we should do proper source address selection or are not jailed.
+ * We will return 0 if we should bypass source address selection in favour
+ * of the primary jail IPv6 address. Only in this case is *ia6 updated and
+ * returned in NBO.
+ * Return EAFNOSUPPORT if this jail does not allow IPv6.
+ */
+int
+prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+ struct in6_addr lia6;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ if (!jailed(cred))
+ return (1);
+
+ pr = cred->cr_prison;
+ if (pr->pr_flags & PR_IP6_SADDRSEL)
+ return (1);
+
+ lia6 = in6addr_any;
+ error = prison_get_ip6(cred, &lia6);
+ if (error)
+ return (error);
+ if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
+ return (1);
+
+ bcopy(&lia6, ia6, sizeof(struct in6_addr));
+ return (0);
+}
+
+/*
+ * Return true if pr1 and pr2 have the same IPv6 address restrictions.
+ */
+int
+prison_equal_ip6(struct prison *pr1, struct prison *pr2)
+{
+
+ if (pr1 == pr2)
+ return (1);
+
+ while (pr1 != &prison0 &&
+#ifdef VIMAGE
+ !(pr1->pr_flags & PR_VNET) &&
+#endif
+ !(pr1->pr_flags & PR_IP6_USER))
+ pr1 = pr1->pr_parent;
+ while (pr2 != &prison0 &&
+#ifdef VIMAGE
+ !(pr2->pr_flags & PR_VNET) &&
+#endif
+ !(pr2->pr_flags & PR_IP6_USER))
+ pr2 = pr2->pr_parent;
+ return (pr1 == pr2);
+}
+
+/*
+ * Make sure our (source) address is set to something meaningful to this jail.
+ *
+ * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
+ * when needed while binding.
+ *
+ * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv6.
+ */
+int
+prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (IN6_IS_ADDR_LOOPBACK(ia6)) {
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
+ /*
+ * In case there is only 1 IPv6 address, and v6only is true,
+ * then bind directly.
+ */
+ if (v6only != 0 && pr->pr_ip6s == 1)
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ error = _prison_check_ip6(pr, ia6);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Rewrite destination address in case we will connect to loopback address.
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
+ */
+int
+prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (IN6_IS_ADDR_LOOPBACK(ia6)) {
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ /*
+ * Return success because nothing had to be changed.
+ */
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred/prison.
+ *
+ * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv6.
+ */
+static int
+_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
+{
+ int i, a, z, d;
+
+ /*
+ * Check the primary IP.
+ */
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
+ return (0);
+
+ /*
+ * All the other IPs are sorted so we can do a binary search.
+ */
+ a = 0;
+ z = pr->pr_ip6s - 2;
+ while (a <= z) {
+ i = (a + z) / 2;
+ d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
+ if (d > 0)
+ z = i - 1;
+ else if (d < 0)
+ a = i + 1;
+ else
+ return (0);
+ }
+
+ return (EADDRNOTAVAIL);
+}
+
+int
+prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ error = _prison_check_ip6(pr, ia6);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+#endif
+
+/*
+ * Check if a jail supports the given address family.
+ *
+ * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
+ * if not.
+ */
+int
+prison_check_af(struct ucred *cred, int af)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+
+ pr = cred->cr_prison;
+#ifdef VIMAGE
+ /* Prisons with their own network stack are not limited. */
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ error = 0;
+ switch (af)
+ {
+#ifdef INET
+ case AF_INET:
+ if (pr->pr_flags & PR_IP4)
+ {
+ mtx_lock(&pr->pr_mtx);
+ if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
+ error = EAFNOSUPPORT;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (pr->pr_flags & PR_IP6)
+ {
+ mtx_lock(&pr->pr_mtx);
+ if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
+ error = EAFNOSUPPORT;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ break;
+#endif
+ case AF_LOCAL:
+ case AF_ROUTE:
+ break;
+ default:
+ if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
+ error = EAFNOSUPPORT;
+ }
+ return (error);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred (wrapper to
+ * prison_check_ip[46]).
+ *
+ * Returns 0 if jail doesn't restrict the address family or if address belongs
+ * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
+ * the jail doesn't allow the address family. The IPv4 address is passed in NBO.
+ */
+int
+prison_if(struct ucred *cred, struct sockaddr *sa)
+{
+#ifdef INET
+ struct sockaddr_in *sai;
+#endif
+#ifdef INET6
+ struct sockaddr_in6 *sai6;
+#endif
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
+
+#ifdef VIMAGE
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ error = 0;
+ switch (sa->sa_family)
+ {
+#ifdef INET
+ case AF_INET:
+ sai = (struct sockaddr_in *)sa;
+ error = prison_check_ip4(cred, &sai->sin_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ sai6 = (struct sockaddr_in6 *)sa;
+ error = prison_check_ip6(cred, &sai6->sin6_addr);
+ break;
+#endif
+ default:
+ if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
+ error = EAFNOSUPPORT;
+ }
+ return (error);
+}
+
+/*
+ * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
+ */
+int
+prison_check(struct ucred *cred1, struct ucred *cred2)
+{
+
+ return ((cred1->cr_prison == cred2->cr_prison ||
+ prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
+}
+
+/*
+ * Return 1 if p2 is a child of p1, otherwise 0.
+ */
+int
+prison_ischild(struct prison *pr1, struct prison *pr2)
+{
+
+ for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
+ if (pr1 == pr2)
+ return (1);
+ return (0);
+}
+
+/*
+ * Return 1 if the passed credential is in a jail, otherwise 0.
+ */
+int
+jailed(struct ucred *cred)
+{
+
+ return (cred->cr_prison != &prison0);
+}
+
+/*
+ * Return 1 if the passed credential is in a jail and that jail does not
+ * have its own virtual network stack, otherwise 0.
+ */
+int
+jailed_without_vnet(struct ucred *cred)
+{
+
+ if (!jailed(cred))
+ return (0);
+#ifdef VIMAGE
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ return (1);
+}
+
+/*
+ * Return the correct hostname (domainname, et al) for the passed credential.
+ */
+void
+getcredhostname(struct ucred *cred, char *buf, size_t size)
+{
+ struct prison *pr;
+
+ /*
+ * A NULL credential can be used to shortcut to the physical
+ * system's hostname.
+ */
+ pr = (cred != NULL) ? cred->cr_prison : &prison0;
+ mtx_lock(&pr->pr_mtx);
+ strlcpy(buf, pr->pr_hostname, size);
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+getcreddomainname(struct ucred *cred, char *buf, size_t size)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ strlcpy(buf, cred->cr_prison->pr_domainname, size);
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+void
+getcredhostuuid(struct ucred *cred, char *buf, size_t size)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+void
+getcredhostid(struct ucred *cred, unsigned long *hostid)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ *hostid = cred->cr_prison->pr_hostid;
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+#ifdef VIMAGE
+/*
+ * Determine whether the prison represented by cred owns
+ * its vnet rather than having it inherited.
+ *
+ * Returns 1 in case the prison owns the vnet, 0 otherwise.
+ */
+int
+prison_owns_vnet(struct ucred *cred)
+{
+
+ /*
+ * vnets cannot be added/removed after jail creation,
+ * so no need to lock here.
+ */
+ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
+}
+#endif
+
+/*
+ * Determine whether the subject represented by cred can "see"
+ * status of a mount point.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ * XXX: This function should be called cr_canseemount() and should be
+ * placed in kern_prot.c.
+ */
+int
+prison_canseemount(struct ucred *cred, struct mount *mp)
+{
+ struct prison *pr;
+ struct statfs *sp;
+ size_t len;
+
+ pr = cred->cr_prison;
+ if (pr->pr_enforce_statfs == 0)
+ return (0);
+ if (pr->pr_root->v_mount == mp)
+ return (0);
+ if (pr->pr_enforce_statfs == 2)
+ return (ENOENT);
+ /*
+ * If jail's chroot directory is set to "/" we should be able to see
+ * all mount-points from inside a jail.
+ * This is an ugly check, but it is the only situation in which the jail's
+ * directory ends with '/'.
+ */
+ if (strcmp(pr->pr_path, "/") == 0)
+ return (0);
+ len = strlen(pr->pr_path);
+ sp = &mp->mnt_stat;
+ if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
+ return (ENOENT);
+ /*
+ * Be sure that we don't have situation where jail's root directory
+ * is "/some/path" and mount point is "/some/pathpath".
+ */
+ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
+ return (ENOENT);
+ return (0);
+}
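/*
 * Illustrative sketch of the path test above (hypothetical helper, not
 * code from this patch): a mount point is visible when the jail's root
 * path is a prefix of it and the next character is '\0' or '/', which is
 * what rejects "/some/pathpath" for a jail rooted at "/some/path".
 */
#include <string.h>

static int
path_is_under(const char *root, const char *path)
{
	size_t len;

	if (strcmp(root, "/") == 0)		/* everything is under "/" */
		return (1);
	len = strlen(root);
	if (strncmp(root, path, len) != 0)
		return (0);
	return (path[len] == '\0' || path[len] == '/');
}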
+
+void
+prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
+{
+ char jpath[MAXPATHLEN];
+ struct prison *pr;
+ size_t len;
+
+ pr = cred->cr_prison;
+ if (pr->pr_enforce_statfs == 0)
+ return;
+ if (prison_canseemount(cred, mp) != 0) {
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ strlcpy(sp->f_mntonname, "[restricted]",
+ sizeof(sp->f_mntonname));
+ return;
+ }
+ if (pr->pr_root->v_mount == mp) {
+ /*
+ * Clear the current buffer data, so we are sure nothing from
+ * the valid path is left there.
+ */
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ *sp->f_mntonname = '/';
+ return;
+ }
+ /*
+ * If jail's chroot directory is set to "/" we should be able to see
+ * all mount-points from inside a jail.
+ */
+ if (strcmp(pr->pr_path, "/") == 0)
+ return;
+ len = strlen(pr->pr_path);
+ strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
+ /*
+ * Clear the current buffer data, so we are sure nothing from
+ * the valid path is left there.
+ */
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ if (*jpath == '\0') {
+ /* Should never happen. */
+ *sp->f_mntonname = '/';
+ } else {
+ strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
+ }
+}
+
+/*
+ * Check whether permission for a specific privilege is granted within jail.
+ * We have a specific list of accepted privileges; the rest are denied.
+ */
+int
+prison_priv_check(struct ucred *cred, int priv)
+{
+
+ if (!jailed(cred))
+ return (0);
+
+#ifdef VIMAGE
+ /*
+ * Privileges specific to prisons with a virtual network stack.
+ * There might be a duplicate entry here in case the privilege
+ * is only granted conditionally in the legacy jail case.
+ */
+ switch (priv) {
+#ifdef notyet
+ /*
+ * NFS-specific privileges.
+ */
+ case PRIV_NFS_DAEMON:
+ case PRIV_NFS_LOCKD:
+#endif
+ /*
+ * Network stack privileges.
+ */
+ case PRIV_NET_BRIDGE:
+ case PRIV_NET_GRE:
+ case PRIV_NET_BPF:
+ case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
+ case PRIV_NET_ROUTE:
+ case PRIV_NET_TAP:
+ case PRIV_NET_SETIFMTU:
+ case PRIV_NET_SETIFFLAGS:
+ case PRIV_NET_SETIFCAP:
+ case PRIV_NET_SETIFDESCR:
+ case PRIV_NET_SETIFNAME:
+ case PRIV_NET_SETIFMETRIC:
+ case PRIV_NET_SETIFPHYS:
+ case PRIV_NET_SETIFMAC:
+ case PRIV_NET_ADDMULTI:
+ case PRIV_NET_DELMULTI:
+ case PRIV_NET_HWIOCTL:
+ case PRIV_NET_SETLLADDR:
+ case PRIV_NET_ADDIFGROUP:
+ case PRIV_NET_DELIFGROUP:
+ case PRIV_NET_IFCREATE:
+ case PRIV_NET_IFDESTROY:
+ case PRIV_NET_ADDIFADDR:
+ case PRIV_NET_DELIFADDR:
+ case PRIV_NET_LAGG:
+ case PRIV_NET_GIF:
+ case PRIV_NET_SETIFVNET:
+ case PRIV_NET_SETIFFIB:
+
+ /*
+ * 802.11-related privileges.
+ */
+ case PRIV_NET80211_GETKEY:
+#ifdef notyet
+ case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */
+#endif
+
+#ifdef notyet
+ /*
+ * AppleTalk privileges.
+ */
+ case PRIV_NETATALK_RESERVEDPORT:
+
+ /*
+ * ATM privileges.
+ */
+ case PRIV_NETATM_CFG:
+ case PRIV_NETATM_ADD:
+ case PRIV_NETATM_DEL:
+ case PRIV_NETATM_SET:
+
+ /*
+ * Bluetooth privileges.
+ */
+ case PRIV_NETBLUETOOTH_RAW:
+#endif
+
+ /*
+ * Netgraph and netgraph module privileges.
+ */
+ case PRIV_NETGRAPH_CONTROL:
+#ifdef notyet
+ case PRIV_NETGRAPH_TTY:
+#endif
+
+ /*
+ * IPv4 and IPv6 privileges.
+ */
+ case PRIV_NETINET_IPFW:
+ case PRIV_NETINET_DIVERT:
+ case PRIV_NETINET_PF:
+ case PRIV_NETINET_DUMMYNET:
+ case PRIV_NETINET_CARP:
+ case PRIV_NETINET_MROUTE:
+ case PRIV_NETINET_RAW:
+ case PRIV_NETINET_ADDRCTRL6:
+ case PRIV_NETINET_ND6:
+ case PRIV_NETINET_SCOPE6:
+ case PRIV_NETINET_ALIFETIME6:
+ case PRIV_NETINET_IPSEC:
+ case PRIV_NETINET_BINDANY:
+
+#ifdef notyet
+ /*
+ * IPX/SPX privileges.
+ */
+ case PRIV_NETIPX_RESERVEDPORT:
+ case PRIV_NETIPX_RAW:
+
+ /*
+ * NCP privileges.
+ */
+ case PRIV_NETNCP:
+
+ /*
+ * SMB privileges.
+ */
+ case PRIV_NETSMB:
+#endif
+
+ /*
+ * No default: or deny here.
+ * If the privilege is not permitted, fall through to the next switch().
+ */
+ if (cred->cr_prison->pr_flags & PR_VNET)
+ return (0);
+ }
+#endif /* VIMAGE */
+
+ switch (priv) {
+
+ /*
+ * Allow ktrace privileges for root in jail.
+ */
+ case PRIV_KTRACE:
+
+#if 0
+ /*
+ * Allow jailed processes to configure audit identity and
+ * submit audit records (login, etc). In the future we may
+ * want to further refine the relationship between audit and
+ * jail.
+ */
+ case PRIV_AUDIT_GETAUDIT:
+ case PRIV_AUDIT_SETAUDIT:
+ case PRIV_AUDIT_SUBMIT:
+#endif
+
+ /*
+ * Allow jailed processes to manipulate process UNIX
+ * credentials in any way they see fit.
+ */
+ case PRIV_CRED_SETUID:
+ case PRIV_CRED_SETEUID:
+ case PRIV_CRED_SETGID:
+ case PRIV_CRED_SETEGID:
+ case PRIV_CRED_SETGROUPS:
+ case PRIV_CRED_SETREUID:
+ case PRIV_CRED_SETREGID:
+ case PRIV_CRED_SETRESUID:
+ case PRIV_CRED_SETRESGID:
+
+ /*
+ * Jail implements visibility constraints already, so allow
+ * jailed root to override uid/gid-based constraints.
+ */
+ case PRIV_SEEOTHERGIDS:
+ case PRIV_SEEOTHERUIDS:
+
+ /*
+ * Jail implements inter-process debugging limits already, so
+ * allow jailed root various debugging privileges.
+ */
+ case PRIV_DEBUG_DIFFCRED:
+ case PRIV_DEBUG_SUGID:
+ case PRIV_DEBUG_UNPRIV:
+
+ /*
+ * Allow jail to set various resource limits and login
+ * properties, and for now, exceed process resource limits.
+ */
+ case PRIV_PROC_LIMIT:
+ case PRIV_PROC_SETLOGIN:
+ case PRIV_PROC_SETRLIMIT:
+
+ /*
+ * System V and POSIX IPC privileges are granted in jail.
+ */
+ case PRIV_IPC_READ:
+ case PRIV_IPC_WRITE:
+ case PRIV_IPC_ADMIN:
+ case PRIV_IPC_MSGSIZE:
+ case PRIV_MQ_ADMIN:
+
+ /*
+ * Jail operations within a jail work on child jails.
+ */
+ case PRIV_JAIL_ATTACH:
+ case PRIV_JAIL_SET:
+ case PRIV_JAIL_REMOVE:
+
+ /*
+ * Jail implements its own inter-process limits, so allow
+ * root processes in jail to change scheduling on other
+ * processes in the same jail. Likewise for signalling.
+ */
+ case PRIV_SCHED_DIFFCRED:
+ case PRIV_SCHED_CPUSET:
+ case PRIV_SIGNAL_DIFFCRED:
+ case PRIV_SIGNAL_SUGID:
+
+ /*
+ * Allow jailed processes to write to sysctls marked as jail
+ * writable.
+ */
+ case PRIV_SYSCTL_WRITEJAIL:
+
+ /*
+ * Allow root in jail to manage a variety of quota
+ * properties. These should likely be conditional on a
+ * configuration option.
+ */
+ case PRIV_VFS_GETQUOTA:
+ case PRIV_VFS_SETQUOTA:
+
+ /*
+ * Since Jail relies on chroot() to implement file system
+ * protections, grant many VFS privileges to root in jail.
+ * Be careful to exclude mount-related and NFS-related
+ * privileges.
+ */
+ case PRIV_VFS_READ:
+ case PRIV_VFS_WRITE:
+ case PRIV_VFS_ADMIN:
+ case PRIV_VFS_EXEC:
+ case PRIV_VFS_LOOKUP:
+ case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
+ case PRIV_VFS_CHFLAGS_DEV:
+ case PRIV_VFS_CHOWN:
+ case PRIV_VFS_CHROOT:
+ case PRIV_VFS_RETAINSUGID:
+ case PRIV_VFS_FCHROOT:
+ case PRIV_VFS_LINK:
+ case PRIV_VFS_SETGID:
+ case PRIV_VFS_STAT:
+ case PRIV_VFS_STICKYFILE:
+
+ /*
+ * As in the non-jail case, non-root users are expected to be
+ * able to read kernel/physical memory (provided /dev/[k]mem
+ * exists in the jail and they have permission to access it).
+ */
+ case PRIV_KMEM_READ:
+ return (0);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * setting system flags.
+ */
+ case PRIV_VFS_SYSFLAGS:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * mounting/unmounting file systems.
+ */
+ case PRIV_VFS_MOUNT:
+ case PRIV_VFS_UNMOUNT:
+ case PRIV_VFS_MOUNT_NONUSER:
+ case PRIV_VFS_MOUNT_OWNER:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
+ cred->cr_prison->pr_enforce_statfs < 2)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Allow jailed root to bind reserved ports and reuse in-use
+ * ports.
+ */
+ case PRIV_NETINET_RESERVEDPORT:
+ case PRIV_NETINET_REUSEPORT:
+ return (0);
+
+ /*
+ * Allow jailed root to set certain IPv4/6 (option) headers.
+ */
+ case PRIV_NETINET_SETHDROPTS:
+ return (0);
+
+ /*
+ * Conditionally allow creating raw sockets in jail.
+ */
+ case PRIV_NETINET_RAW:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Since jail implements its own visibility limits on netstat
+ * sysctls, allow getcred. This allows identd to work in
+ * jail.
+ */
+ case PRIV_NETINET_GETCRED:
+ return (0);
+
+ /*
+ * Allow jailed root to set loginclass.
+ */
+ case PRIV_PROC_SETLOGINCLASS:
+ return (0);
+
+ default:
+ /*
+ * In all remaining cases, deny the privilege request. This
+ * includes almost all network privileges and many system
+ * configuration privileges.
+ */
+ return (EPERM);
+ }
+}
+
+/*
+ * Return the part of pr2's name that is relative to pr1, or the whole name
+ * if it does not directly follow.
+ */
+char *
+prison_name(struct prison *pr1, struct prison *pr2)
+{
+ char *name;
+
+ /* Jails see themselves as "0" (if they see themselves at all). */
+ if (pr1 == pr2)
+ return "0";
+ name = pr2->pr_name;
+ if (prison_ischild(pr1, pr2)) {
+ /*
+ * pr1 isn't locked (and allprison_lock may not be either)
+ * so its length can't be counted on. But the number of dots
+ * can be counted on - and counted.
+ */
+ for (; pr1 != &prison0; pr1 = pr1->pr_parent)
+ name = strchr(name, '.') + 1;
+ }
+ return (name);
+}
+
+/*
+ * Return the part of pr2's path that is relative to pr1, or the whole path
+ * if it does not directly follow.
+ */
+static char *
+prison_path(struct prison *pr1, struct prison *pr2)
+{
+ char *path1, *path2;
+ int len1;
+
+ path1 = pr1->pr_path;
+ path2 = pr2->pr_path;
+ if (!strcmp(path1, "/"))
+ return (path2);
+ len1 = strlen(path1);
+ if (strncmp(path1, path2, len1))
+ return (path2);
+ if (path2[len1] == '\0')
+ return "/";
+ if (path2[len1] == '/')
+ return (path2 + len1);
+ return (path2);
+}
+
+
+/*
+ * Jail-related sysctls.
+ */
+static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
+ "Jails");
+
+static int
+sysctl_jail_list(SYSCTL_HANDLER_ARGS)
+{
+ struct xprison *xp;
+ struct prison *pr, *cpr;
+#ifdef INET
+ struct in_addr *ip4 = NULL;
+ int ip4s = 0;
+#endif
+#ifdef INET6
+ struct in6_addr *ip6 = NULL;
+ int ip6s = 0;
+#endif
+ int descend, error;
+
+ xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
+ pr = req->td->td_ucred->cr_prison;
+ error = 0;
+ sx_slock(&allprison_lock);
+ FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
+#if defined(INET) || defined(INET6)
+ again:
+#endif
+ mtx_lock(&cpr->pr_mtx);
+#ifdef INET
+ if (cpr->pr_ip4s > 0) {
+ if (ip4s < cpr->pr_ip4s) {
+ ip4s = cpr->pr_ip4s;
+ mtx_unlock(&cpr->pr_mtx);
+ ip4 = realloc(ip4, ip4s *
+ sizeof(struct in_addr), M_TEMP, M_WAITOK);
+ goto again;
+ }
+ bcopy(cpr->pr_ip4, ip4,
+ cpr->pr_ip4s * sizeof(struct in_addr));
+ }
+#endif
+#ifdef INET6
+ if (cpr->pr_ip6s > 0) {
+ if (ip6s < cpr->pr_ip6s) {
+ ip6s = cpr->pr_ip6s;
+ mtx_unlock(&cpr->pr_mtx);
+ ip6 = realloc(ip6, ip6s *
+ sizeof(struct in6_addr), M_TEMP, M_WAITOK);
+ goto again;
+ }
+ bcopy(cpr->pr_ip6, ip6,
+ cpr->pr_ip6s * sizeof(struct in6_addr));
+ }
+#endif
+ if (cpr->pr_ref == 0) {
+ mtx_unlock(&cpr->pr_mtx);
+ continue;
+ }
+ bzero(xp, sizeof(*xp));
+ xp->pr_version = XPRISON_VERSION;
+ xp->pr_id = cpr->pr_id;
+ xp->pr_state = cpr->pr_uref > 0
+ ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
+ strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
+ strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
+ strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
+#ifdef INET
+ xp->pr_ip4s = cpr->pr_ip4s;
+#endif
+#ifdef INET6
+ xp->pr_ip6s = cpr->pr_ip6s;
+#endif
+ mtx_unlock(&cpr->pr_mtx);
+ error = SYSCTL_OUT(req, xp, sizeof(*xp));
+ if (error)
+ break;
+#ifdef INET
+ if (xp->pr_ip4s > 0) {
+ error = SYSCTL_OUT(req, ip4,
+ xp->pr_ip4s * sizeof(struct in_addr));
+ if (error)
+ break;
+ }
+#endif
+#ifdef INET6
+ if (xp->pr_ip6s > 0) {
+ error = SYSCTL_OUT(req, ip6,
+ xp->pr_ip6s * sizeof(struct in6_addr));
+ if (error)
+ break;
+ }
+#endif
+ }
+ sx_sunlock(&allprison_lock);
+ free(xp, M_TEMP);
+#ifdef INET
+ free(ip4, M_TEMP);
+#endif
+#ifdef INET6
+ free(ip6, M_TEMP);
+#endif
+ return (error);
+}
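/*
 * Illustrative sketch of the buffer-sizing loop above (hypothetical
 * userspace names, not code from this patch): the address buffer cannot
 * be grown while the prison mutex is held because the allocator may
 * sleep, so the code drops the lock, grows the buffer, and retries from
 * "again", since the counts may have changed in the meantime.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct counted { pthread_mutex_t lock; int *data; int count; };

static int *
snapshot_counted(struct counted *c, int **bufp, int *bufcntp)
{
again:
	pthread_mutex_lock(&c->lock);
	if (*bufcntp < c->count) {
		int want = c->count;
		int *p;

		/* A blocking allocator must not run with the lock held. */
		pthread_mutex_unlock(&c->lock);
		p = realloc(*bufp, want * sizeof(int));
		if (p == NULL)
			return (NULL);
		*bufp = p;
		*bufcntp = want;
		goto again;	/* the count may have grown meanwhile */
	}
	if (c->count > 0)
		memcpy(*bufp, c->data, c->count * sizeof(int));
	pthread_mutex_unlock(&c->lock);
	return (*bufp);
}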
+
+SYSCTL_OID(_security_jail, OID_AUTO, list,
+ CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_list, "S", "List of active jails");
+
+static int
+sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
+{
+ int error, injail;
+
+ injail = jailed(req->td->td_ucred);
+ error = SYSCTL_OUT(req, &injail, sizeof(injail));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_jailed, "I", "Process in jail?");
+
+static int
+sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
+{
+ int error, havevnet;
+#ifdef VIMAGE
+ struct ucred *cred = req->td->td_ucred;
+
+ havevnet = jailed(cred) && prison_owns_vnet(cred);
+#else
+ havevnet = 0;
+#endif
+ error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_vnet, "I", "Jail owns VNET?");
+
+#if defined(INET) || defined(INET6)
+SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
+ &jail_max_af_ips, 0,
+ "Number of IP addresses a jail may have at most per address family");
+#endif
+
+/*
+ * Default parameters for jail(2) compatibility. For historical reasons,
+ * the sysctl names have varying similarity to the parameter names. Prisons
+ * just see their own parameters, and can't change them.
+ */
+static int
+sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ int allow, error, i;
+
+ pr = req->td->td_ucred->cr_prison;
+ allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
+
+ /* Get the current flag value, and convert it to a boolean. */
+ i = (allow & arg2) ? 1 : 0;
+ if (arg1 != NULL)
+ i = !i;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ i = i ? arg2 : 0;
+ if (arg1 != NULL)
+ i ^= arg2;
+ /*
+ * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
+ * for writing.
+ */
+ mtx_lock(&prison0.pr_mtx);
+ jail_default_allow = (jail_default_allow & ~arg2) | i;
+ mtx_unlock(&prison0.pr_mtx);
+ return (0);
+}
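/*
 * Illustrative sketch of the bit/boolean conversion done by the handler
 * above (hypothetical helpers, not code from this patch): arg2 selects
 * the PR_ALLOW_* bit, and a non-NULL arg1 inverts the sense, so "allow"
 * bits can back sysctls with "deny"-style names.
 */
static int
allow_flag_to_bool(unsigned allow, unsigned bit, int invert)
{
	int i = (allow & bit) ? 1 : 0;

	return (invert ? !i : i);
}

static unsigned
allow_bool_to_flags(unsigned allow, unsigned bit, int invert, int val)
{
	unsigned i = val ? bit : 0;

	if (invert)
		i ^= bit;	/* flip the sense back before merging */
	return ((allow & ~bit) | i);
}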
+
+SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
+ "Processes in jail can set their hostnames");
+SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
+ "Processes in jail are limited to creating UNIX/IP/route sockets only");
+SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
+ "Processes in jail can use System V IPC primitives");
+SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
+ "Prison root can create raw sockets");
+SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
+ "Processes in jail can alter system file flags");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount/unmount jail-friendly file systems");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the devfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the nullfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the procfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the tmpfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the zfs file system");
+
+static int
+sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ int level, error;
+
+ pr = req->td->td_ucred->cr_prison;
+ level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ *(int *)arg1 = level;
+ return (0);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
+ sysctl_jail_default_level, "I",
+ "Processes in jail cannot see all mounted file systems");
+
+SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
+ sysctl_jail_default_level, "I",
+ "Ruleset for the devfs filesystem in jail");
+
+/*
+ * Nodes to describe jail parameters. Maximum length of string parameters
+ * is returned in the string itself, and the other parameters exist merely
+ * to make themselves and their types known.
+ */
+SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
+ "Jail parameters");
+
+int
+sysctl_jail_param(SYSCTL_HANDLER_ARGS)
+{
+ int i;
+ long l;
+ size_t s;
+ char numbuf[12];
+
+ switch (oidp->oid_kind & CTLTYPE)
+ {
+ case CTLTYPE_LONG:
+ case CTLTYPE_ULONG:
+ l = 0;
+#ifdef SCTL_MASK32
+ if (!(req->flags & SCTL_MASK32))
+#endif
+ return (SYSCTL_OUT(req, &l, sizeof(l)));
+ case CTLTYPE_INT:
+ case CTLTYPE_UINT:
+ i = 0;
+ return (SYSCTL_OUT(req, &i, sizeof(i)));
+ case CTLTYPE_STRING:
+ snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
+ return
+ (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
+ case CTLTYPE_STRUCT:
+ s = (size_t)arg2;
+ return (SYSCTL_OUT(req, &s, sizeof(s)));
+ }
+ return (0);
+}
+
+SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
+SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
+SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
+SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
+SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Jail secure level");
+SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Jail cannot see all mounted file systems");
+SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Ruleset for in-jail devfs mounts");
+SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail persistence");
+#ifdef VIMAGE
+SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
+ "E,jailsys", "Virtual network stack");
+#endif
+SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
+ "B", "Jail is in the process of shutting down");
+
+SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
+SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
+ "I", "Current number of child jails");
+SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Maximum number of child jails");
+
+SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
+SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
+ "Jail hostname");
+SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
+ "Jail NIS domainname");
+SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
+ "Jail host UUID");
+SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
+ "LU", "Jail host ID");
+
+SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
+SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
+
+#ifdef INET
+SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
+ "Jail IPv4 address virtualization");
+SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
+ "S,in_addr,a", "Jail IPv4 addresses");
+SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Do (not) use IPv4 source address selection rather than the "
+ "primary jail IPv4 address.");
+#endif
+#ifdef INET6
+SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
+ "Jail IPv6 address virtualization");
+SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
+ "S,in6_addr,a", "Jail IPv6 addresses");
+SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Do (not) use IPv6 source address selection rather than the "
+ "primary jail IPv6 address.");
+#endif
+
+SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
+SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may set hostname");
+SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may use SYSV IPC");
+SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may create raw sockets");
+SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may alter system file flags");
+SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may set file quotas");
+SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
+
+SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
+SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount/unmount jail-friendly file systems in general");
+SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the devfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the nullfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the procfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the tmpfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the zfs file system");
+
+void
+prison_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct prison_racct *prr;
+
+ sx_slock(&allprison_lock);
+ LIST_FOREACH(prr, &allprison_racct, prr_next)
+ (callback)(prr->prr_racct, arg2, arg3);
+ sx_sunlock(&allprison_lock);
+}
+
+static struct prison_racct *
+prison_racct_find_locked(const char *name)
+{
+ struct prison_racct *prr;
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
+ return (NULL);
+
+ LIST_FOREACH(prr, &allprison_racct, prr_next) {
+ if (strcmp(name, prr->prr_name) != 0)
+ continue;
+
+ /* Found prison_racct with a matching name? */
+ prison_racct_hold(prr);
+ return (prr);
+ }
+
+ /* Add new prison_racct. */
+ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
+ racct_create(&prr->prr_racct);
+
+ strcpy(prr->prr_name, name);
+ refcount_init(&prr->prr_refcount, 1);
+ LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
+
+ return (prr);
+}
+
+struct prison_racct *
+prison_racct_find(const char *name)
+{
+ struct prison_racct *prr;
+
+ sx_xlock(&allprison_lock);
+ prr = prison_racct_find_locked(name);
+ sx_xunlock(&allprison_lock);
+ return (prr);
+}
+
+void
+prison_racct_hold(struct prison_racct *prr)
+{
+
+ refcount_acquire(&prr->prr_refcount);
+}
+
+static void
+prison_racct_free_locked(struct prison_racct *prr)
+{
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ if (refcount_release(&prr->prr_refcount)) {
+ racct_destroy(&prr->prr_racct);
+ LIST_REMOVE(prr, prr_next);
+ free(prr, M_PRISON_RACCT);
+ }
+}
+
+void
+prison_racct_free(struct prison_racct *prr)
+{
+ int old;
+
+ sx_assert(&allprison_lock, SA_UNLOCKED);
+
+ old = prr->prr_refcount;
+ if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
+ return;
+
+ sx_xlock(&allprison_lock);
+ prison_racct_free_locked(prr);
+ sx_xunlock(&allprison_lock);
+}
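/*
 * Illustrative sketch of the release pattern above (C11 atomics,
 * hypothetical names, not code from this patch): decrement without the
 * lock while more than one reference remains, and take the list lock
 * only when this may be the final release that has to unlink and free.
 */
#include <stdatomic.h>
#include <pthread.h>

struct refobj { _Atomic unsigned refs; };

static pthread_mutex_t refobj_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
refobj_destroy_locked(struct refobj *o)
{

	/* Unlink from the containing list and free; elided in this sketch. */
	(void)o;
}

static void
refobj_release(struct refobj *o)
{
	unsigned old = atomic_load(&o->refs);

	/* Fast path: not the last reference, no lock needed. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(&o->refs, &old, old - 1))
			return;
		/* A failed exchange reloaded 'old'; retry. */
	}

	/* Slow path: possibly the last reference. */
	pthread_mutex_lock(&refobj_list_lock);
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		refobj_destroy_locked(o);
	pthread_mutex_unlock(&refobj_list_lock);
}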
+
+#ifdef RACCT
+static void
+prison_racct_attach(struct prison *pr)
+{
+ struct prison_racct *prr;
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ prr = prison_racct_find_locked(pr->pr_name);
+ KASSERT(prr != NULL, ("cannot find prison_racct"));
+
+ pr->pr_prison_racct = prr;
+}
+
+/*
+ * Handle jail renaming. From the racct point of view, renaming means
+ * moving from one prison_racct to another.
+ */
+static void
+prison_racct_modify(struct prison *pr)
+{
+ struct proc *p;
+ struct ucred *cred;
+ struct prison_racct *oldprr;
+
+ sx_slock(&allproc_lock);
+ sx_xlock(&allprison_lock);
+
+ if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
+ sx_xunlock(&allprison_lock);
+ sx_sunlock(&allproc_lock);
+ return;
+ }
+
+ oldprr = pr->pr_prison_racct;
+ pr->pr_prison_racct = NULL;
+
+ prison_racct_attach(pr);
+
+ /*
+ * Move resource utilisation records.
+ */
+ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
+
+ /*
+ * Force rctl to reattach rules to processes.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ cred = crhold(p->p_ucred);
+ PROC_UNLOCK(p);
+ racct_proc_ucred_changed(p, cred, cred);
+ crfree(cred);
+ }
+
+ sx_sunlock(&allproc_lock);
+ prison_racct_free_locked(oldprr);
+ sx_xunlock(&allprison_lock);
+}
+
+static void
+prison_racct_detach(struct prison *pr)
+{
+
+ sx_assert(&allprison_lock, SA_UNLOCKED);
+
+ if (pr->pr_prison_racct == NULL)
+ return;
+ prison_racct_free(pr->pr_prison_racct);
+ pr->pr_prison_racct = NULL;
+}
+#endif /* RACCT */
+
+#ifdef DDB
+
+static void
+db_show_prison(struct prison *pr)
+{
+ int fi;
+#if defined(INET) || defined(INET6)
+ int ii;
+#endif
+ unsigned jsf;
+#ifdef INET6
+ char ip6buf[INET6_ADDRSTRLEN];
+#endif
+
+ db_printf("prison %p:\n", pr);
+ db_printf(" jid = %d\n", pr->pr_id);
+ db_printf(" name = %s\n", pr->pr_name);
+ db_printf(" parent = %p\n", pr->pr_parent);
+ db_printf(" ref = %d\n", pr->pr_ref);
+ db_printf(" uref = %d\n", pr->pr_uref);
+ db_printf(" path = %s\n", pr->pr_path);
+ db_printf(" cpuset = %d\n", pr->pr_cpuset
+ ? pr->pr_cpuset->cs_id : -1);
+#ifdef VIMAGE
+ db_printf(" vnet = %p\n", pr->pr_vnet);
+#endif
+ db_printf(" root = %p\n", pr->pr_root);
+ db_printf(" securelevel = %d\n", pr->pr_securelevel);
+ db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
+ db_printf(" children.max = %d\n", pr->pr_childmax);
+ db_printf(" children.cur = %d\n", pr->pr_childcount);
+ db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
+ db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
+ db_printf(" flags = 0x%x", pr->pr_flags);
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++)
+ if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
+ db_printf(" %s", pr_flag_names[fi]);
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
+ jsf = pr->pr_flags &
+ (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
+ db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
+ pr_flag_jailsys[fi].disable &&
+ (jsf == pr_flag_jailsys[fi].disable) ? "disable"
+ : (jsf == pr_flag_jailsys[fi].new) ? "new"
+ : "inherit");
+ }
+ db_printf(" allow = 0x%x", pr->pr_allow);
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++)
+ if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
+ db_printf(" %s", pr_allow_names[fi]);
+ db_printf("\n");
+ db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
+ db_printf(" host.hostname = %s\n", pr->pr_hostname);
+ db_printf(" host.domainname = %s\n", pr->pr_domainname);
+ db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
+ db_printf(" host.hostid = %lu\n", pr->pr_hostid);
+#ifdef INET
+ db_printf(" ip4s = %d\n", pr->pr_ip4s);
+ for (ii = 0; ii < pr->pr_ip4s; ii++)
+ db_printf(" %s %s\n",
+ ii == 0 ? "ip4.addr =" : " ",
+ inet_ntoa(pr->pr_ip4[ii]));
+#endif
+#ifdef INET6
+ db_printf(" ip6s = %d\n", pr->pr_ip6s);
+ for (ii = 0; ii < pr->pr_ip6s; ii++)
+ db_printf(" %s %s\n",
+ ii == 0 ? "ip6.addr =" : " ",
+ ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
+#endif
+}
+
+DB_SHOW_COMMAND(prison, db_show_prison_command)
+{
+ struct prison *pr;
+
+ if (!have_addr) {
+ /*
+ * Show all prisons in the list, and prison0 which is not
+ * listed.
+ */
+ db_show_prison(&prison0);
+ if (!db_pager_quit) {
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ db_show_prison(pr);
+ if (db_pager_quit)
+ break;
+ }
+ }
+ return;
+ }
+
+ if (addr == 0)
+ pr = &prison0;
+ else {
+ /* Look for a prison with the ID and with references. */
+ TAILQ_FOREACH(pr, &allprison, pr_list)
+ if (pr->pr_id == addr && pr->pr_ref > 0)
+ break;
+ if (pr == NULL)
+ /* Look again, without requiring a reference. */
+ TAILQ_FOREACH(pr, &allprison, pr_list)
+ if (pr->pr_id == addr)
+ break;
+ if (pr == NULL)
+ /* Assume address points to a valid prison. */
+ pr = (struct prison *)addr;
+ }
+ db_show_prison(pr);
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_khelp.c b/sys/kern/kern_khelp.c
new file mode 100644
index 0000000..50751e9
--- /dev/null
+++ b/sys/kern/kern_khelp.c
@@ -0,0 +1,372 @@
+/*-
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology,
+ * made possible in part by grants from the FreeBSD Foundation and Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/khelp.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/module_khelp.h>
+#include <sys/osd.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+
+static struct rwlock khelp_list_lock;
+RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock");
+
+static TAILQ_HEAD(helper_head, helper) helpers = TAILQ_HEAD_INITIALIZER(helpers);
+
+/* Private function prototypes. */
+static inline void khelp_remove_osd(struct helper *h, struct osd *hosd);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
+
+#define KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock)
+#define KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock)
+#define KHELP_LIST_RLOCK() rw_rlock(&khelp_list_lock)
+#define KHELP_LIST_RUNLOCK() rw_runlock(&khelp_list_lock)
+#define KHELP_LIST_LOCK_ASSERT() rw_assert(&khelp_list_lock, RA_LOCKED)
+
+int
+khelp_register_helper(struct helper *h)
+{
+ struct helper *tmph;
+ int error, i, inserted;
+
+ error = inserted = 0;
+ refcount_init(&h->h_refcount, 0);
+ h->h_id = osd_register(OSD_KHELP, NULL, NULL);
+
+ /* It's only safe to add the hooks after osd_register(). */
+ for (i = 0; i < h->h_nhooks && !error; i++) {
+ /* We don't require the module to assign hook_helper. */
+ h->h_hooks[i].hook_helper = h;
+ error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK);
+ if (error)
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n", __func__,
+ h->h_name, h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
+ }
+
+ if (error) {
+ for (i--; i >= 0; i--)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
+ osd_deregister(OSD_KHELP, h->h_id);
+ } else {
+ KHELP_LIST_WLOCK();
+ /*
+ * Keep list of helpers sorted in descending h_id order. Due to
+ * the way osd_set() works, a sorted list ensures
+ * khelp_init_osd() will operate with improved efficiency.
+ */
+ TAILQ_FOREACH(tmph, &helpers, h_next) {
+ if (tmph->h_id < h->h_id) {
+ TAILQ_INSERT_BEFORE(tmph, h, h_next);
+ inserted = 1;
+ break;
+ }
+ }
+
+ if (!inserted)
+ TAILQ_INSERT_TAIL(&helpers, h, h_next);
+ KHELP_LIST_WUNLOCK();
+ }
+
+ return (error);
+}
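/*
 * Illustrative sketch of the sorted insert above (hypothetical element
 * type, not code from this patch): walk the tail queue and insert before
 * the first element with a smaller id, falling back to an append, which
 * keeps the list in descending id order.
 */
#include <sys/queue.h>

struct elem {
	int id;
	TAILQ_ENTRY(elem) link;
};
TAILQ_HEAD(elem_head, elem);

static void
insert_desc(struct elem_head *head, struct elem *e)
{
	struct elem *cur;

	TAILQ_FOREACH(cur, head, link) {
		if (cur->id < e->id) {
			TAILQ_INSERT_BEFORE(cur, e, link);
			return;
		}
	}
	/* Nothing smaller was found; appending keeps the order. */
	TAILQ_INSERT_TAIL(head, e, link);
}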
+
+int
+khelp_deregister_helper(struct helper *h)
+{
+ struct helper *tmph;
+ int error, i;
+
+ KHELP_LIST_WLOCK();
+ if (h->h_refcount > 0)
+ error = EBUSY;
+ else {
+ error = ENOENT;
+ TAILQ_FOREACH(tmph, &helpers, h_next) {
+ if (tmph == h) {
+ TAILQ_REMOVE(&helpers, h, h_next);
+ error = 0;
+ break;
+ }
+ }
+ }
+ KHELP_LIST_WUNLOCK();
+
+ if (!error) {
+ for (i = 0; i < h->h_nhooks; i++)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
+ osd_deregister(OSD_KHELP, h->h_id);
+ }
+
+ return (error);
+}
+
+int
+khelp_init_osd(uint32_t classes, struct osd *hosd)
+{
+ struct helper *h;
+ void *hdata;
+ int error;
+
+ KASSERT(hosd != NULL, ("struct osd not initialised!"));
+
+ error = 0;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ /* If helper is correct class and needs to store OSD... */
+ if (h->h_classes & classes && h->h_flags & HELPER_NEEDS_OSD) {
+ hdata = uma_zalloc(h->h_zone, M_NOWAIT);
+ if (hdata == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ osd_set(OSD_KHELP, hosd, h->h_id, hdata);
+ refcount_acquire(&h->h_refcount);
+ }
+ }
+
+ if (error) {
+ /* Delete OSD that was assigned prior to the error. */
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ if (h->h_classes & classes)
+ khelp_remove_osd(h, hosd);
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+
+ return (error);
+}
+
+int
+khelp_destroy_osd(struct osd *hosd)
+{
+ struct helper *h;
+ int error;
+
+ KASSERT(hosd != NULL, ("struct osd not initialised!"));
+
+ error = 0;
+
+ KHELP_LIST_RLOCK();
+ /*
+ * Clean up all khelp related OSD.
+ *
+ * XXXLAS: Would be nice to use something like osd_exit() here but it
+ * doesn't have the right semantics for this purpose.
+ */
+ TAILQ_FOREACH(h, &helpers, h_next)
+ khelp_remove_osd(h, hosd);
+ KHELP_LIST_RUNLOCK();
+
+ return (error);
+}
+
+static inline void
+khelp_remove_osd(struct helper *h, struct osd *hosd)
+{
+ void *hdata;
+
+ if (h->h_flags & HELPER_NEEDS_OSD) {
+ /*
+ * If the current helper uses OSD and calling osd_get()
+ * on the helper's h_id returns non-NULL, the helper has
+ * OSD attached to 'hosd' which needs to be cleaned up.
+ */
+ hdata = osd_get(OSD_KHELP, hosd, h->h_id);
+ if (hdata != NULL) {
+ uma_zfree(h->h_zone, hdata);
+ osd_del(OSD_KHELP, hosd, h->h_id);
+ refcount_release(&h->h_refcount);
+ }
+ }
+}
+
+void *
+khelp_get_osd(struct osd *hosd, int32_t id)
+{
+
+ return (osd_get(OSD_KHELP, hosd, id));
+}
+
+int32_t
+khelp_get_id(char *hname)
+{
+ struct helper *h;
+ int32_t id;
+
+ id = -1;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) {
+ id = h->h_id;
+ break;
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+
+ return (id);
+}
+
+int
+khelp_add_hhook(struct hookinfo *hki, uint32_t flags)
+{
+ int error;
+
+ /*
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
+ */
+ error = hhook_add_hook_lookup(hki, flags);
+
+ return (error);
+}
+
+int
+khelp_remove_hhook(struct hookinfo *hki)
+{
+ int error;
+
+ /*
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
+ */
+ error = hhook_remove_hook_lookup(hki);
+
+ return (error);
+}
+
+/*
+ * Private KPI between hhook and khelp that allows khelp modules to insert hook
+ * functions into hhook points which register after the modules were loaded.
+ */
+void
+khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags)
+{
+ struct helper *h;
+ int error, i;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ for (i = 0; i < h->h_nhooks; i++) {
+ if (hhh->hhh_type != h->h_hooks[i].hook_type ||
+ hhh->hhh_id != h->h_hooks[i].hook_id)
+ continue;
+ error = hhook_add_hook(hhh, &h->h_hooks[i], flags);
+ if (error) {
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n",
+ __func__, h->h_name,
+ h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
+ error = 0;
+ }
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+}
+
+int
+khelp_modevent(module_t mod, int event_type, void *data)
+{
+ struct khelp_modevent_data *kmd;
+ int error;
+
+ kmd = (struct khelp_modevent_data *)data;
+ error = 0;
+
+ switch(event_type) {
+ case MOD_LOAD:
+ if (kmd->helper->h_flags & HELPER_NEEDS_OSD) {
+ if (kmd->uma_zsize <= 0) {
+ printf("Use KHELP_DECLARE_MOD_UMA() instead!\n");
+ error = EDOOFUS;
+ break;
+ }
+ kmd->helper->h_zone = uma_zcreate(kmd->name,
+ kmd->uma_zsize, kmd->umactor, kmd->umadtor, NULL,
+ NULL, 0, 0);
+ if (kmd->helper->h_zone == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ }
+ strlcpy(kmd->helper->h_name, kmd->name, HELPER_NAME_MAXLEN);
+ kmd->helper->h_hooks = kmd->hooks;
+ kmd->helper->h_nhooks = kmd->nhooks;
+ if (kmd->helper->mod_init != NULL)
+ error = kmd->helper->mod_init();
+ if (!error)
+ error = khelp_register_helper(kmd->helper);
+ break;
+
+ case MOD_QUIESCE:
+ case MOD_SHUTDOWN:
+ case MOD_UNLOAD:
+ error = khelp_deregister_helper(kmd->helper);
+ if (!error) {
+ if (kmd->helper->h_flags & HELPER_NEEDS_OSD)
+ uma_zdestroy(kmd->helper->h_zone);
+ if (kmd->helper->mod_destroy != NULL)
+ kmd->helper->mod_destroy();
+ } else if (error == ENOENT)
+ /* Do nothing and allow unload if helper not in list. */
+ error = 0;
+ else if (error == EBUSY)
+ printf("Khelp module \"%s\" can't unload until its "
+ "refcount drops from %d to 0.\n", kmd->name,
+ kmd->helper->h_refcount);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
new file mode 100644
index 0000000..969c513
--- /dev/null
+++ b/sys/kern/kern_kthread.c
@@ -0,0 +1,466 @@
+/*-
+ * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/signalvar.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+#include <sys/sched.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Start a kernel process. This is called after a fork() call in
+ * mi_startup() in the file kern/init_main.c.
+ *
+ * This function is used to start "internal" daemons and intended
+ * to be called from SYSINIT().
+ */
+void
+kproc_start(udata)
+ const void *udata;
+{
+ const struct kproc_desc *kp = udata;
+ int error;
+
+ error = kproc_create((void (*)(void *))kp->func, NULL,
+ kp->global_procpp, 0, 0, "%s", kp->arg0);
+ if (error)
+ panic("kproc_start: %s: error %d", kp->arg0, error);
+}
+
+/*
+ * Create a kernel process/thread/whatever. It shares its address space
+ * with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newpp is the return value pointing to the thread's struct proc.
+ * flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.).
+ */
+int
+kproc_create(void (*func)(void *), void *arg,
+ struct proc **newpp, int flags, int pages, const char *fmt, ...)
+{
+ int error;
+ va_list ap;
+ struct thread *td;
+ struct proc *p2;
+
+ if (!proc0.p_stats)
+ panic("kproc_create called too soon");
+
+ error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags,
+ pages, &p2, NULL, 0);
+ if (error)
+ return error;
+
+ /* save a global descriptor, if desired */
+ if (newpp != NULL)
+ *newpp = p2;
+
+ /* this is a non-swapped system process */
+ PROC_LOCK(p2);
+ td = FIRST_THREAD_IN_PROC(p2);
+ p2->p_flag |= P_SYSTEM | P_KTHREAD;
+ td->td_pflags |= TDP_KTHREAD;
+ mtx_lock(&p2->p_sigacts->ps_mtx);
+ p2->p_sigacts->ps_flag |= PS_NOCLDWAIT;
+ mtx_unlock(&p2->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p2);
+
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap);
+ va_end(ap);
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap);
+ va_end(ap);
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+
+ /* call the processes' main()... */
+ cpu_set_fork_handler(td, func, arg);
+
+ /* Avoid inheriting affinity from a random parent. */
+ cpuset_setthread(td->td_tid, cpuset_root);
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ sched_prio(td, PVM);
+ sched_user_prio(td, PUSER);
+
+ /* Delay putting it on the run queue until now. */
+ if (!(flags & RFSTOPPED))
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+
+ return 0;
+}
+
+void
+kproc_exit(int ecode)
+{
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread;
+ p = td->td_proc;
+
+ /*
+ * Reparent curthread from proc0 to init so that the zombie
+ * is harvested.
+ */
+ sx_xlock(&proctree_lock);
+ PROC_LOCK(p);
+ proc_reparent(p, initproc);
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * Wakeup anyone waiting for us to exit.
+ */
+ wakeup(p);
+
+ /* Buh-bye! */
+ exit1(td, W_EXITCODE(ecode, 0));
+}
+
+/*
+ * Advise a kernel process to suspend (or resume) in its main loop.
+ * Participation is voluntary.
+ */
+int
+kproc_suspend(struct proc *p, int timo)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGADDSET(p->p_siglist, SIGSTOP);
+ wakeup(p);
+ return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkp", timo);
+}
+
+int
+kproc_resume(struct proc *p)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGDELSET(p->p_siglist, SIGSTOP);
+ PROC_UNLOCK(p);
+ wakeup(&p->p_siglist);
+ return (0);
+}
+
+void
+kproc_suspend_check(struct proc *p)
+{
+ PROC_LOCK(p);
+ while (SIGISMEMBER(p->p_siglist, SIGSTOP)) {
+ wakeup(&p->p_siglist);
+ msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "kpsusp", 0);
+ }
+ PROC_UNLOCK(p);
+}
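/*
 * Illustrative sketch of how a daemon participates in the voluntary
 * suspend protocol above (hypothetical daemon, not code from this patch):
 * kproc_suspend() only takes effect once the process reaches its own
 * kproc_suspend_check() call at a safe point in its loop.
 */
static void
example_kproc_main(void *arg)
{

	(void)arg;
	for (;;) {
		/* Park here while someone holds us suspended. */
		kproc_suspend_check(curproc);

		/* ... perform one unit of the daemon's work ... */

		/* Wait a tick for more work; details elided. */
		pause("exwait", hz);
	}
}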
+
+
+/*
+ * Start a kernel thread.
+ *
+ * This function is used to start "internal" daemons and intended
+ * to be called from SYSINIT().
+ */
+
+void
+kthread_start(udata)
+ const void *udata;
+{
+ const struct kthread_desc *kp = udata;
+ int error;
+
+ error = kthread_add((void (*)(void *))kp->func, NULL,
+ NULL, kp->global_threadpp, 0, 0, "%s", kp->arg0);
+ if (error)
+ panic("kthread_start: %s: error %d", kp->arg0, error);
+}
+
+/*
+ * Create a kernel thread. It shares its address space
+ * with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newtdp is the return value pointing to the thread's struct thread.
+ * ** XXX fix this --> flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newtd)->td_name (for ps, etc.).
+ */
+int
+kthread_add(void (*func)(void *), void *arg, struct proc *p,
+ struct thread **newtdp, int flags, int pages, const char *fmt, ...)
+{
+ va_list ap;
+ struct thread *newtd, *oldtd;
+
+ if (!proc0.p_stats)
+ panic("kthread_add called too soon");
+
+ /* If no process supplied, put it on proc0 */
+ if (p == NULL)
+ p = &proc0;
+
+ /* Initialize our new td */
+ newtd = thread_alloc(pages);
+ if (newtd == NULL)
+ return (ENOMEM);
+
+ PROC_LOCK(p);
+ oldtd = FIRST_THREAD_IN_PROC(p);
+
+ bzero(&newtd->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+ bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(newtd->td_name, sizeof(newtd->td_name), fmt, ap);
+ va_end(ap);
+
+ newtd->td_proc = p; /* needed for cpu_set_upcall */
+
+ /* XXX optimise this probably? */
+ /* On x86 (and probably the others too) it is way too full of junk */
+ /* Needs a better name */
+ cpu_set_upcall(newtd, oldtd);
+ /* put the designated function(arg) as the resume context */
+ cpu_set_fork_handler(newtd, func, arg);
+
+ newtd->td_pflags |= TDP_KTHREAD;
+ newtd->td_ucred = crhold(p->p_ucred);
+
+ /* this code almost the same as create_thread() in kern_thr.c */
+ p->p_flag |= P_HADTHREADS;
+ thread_link(newtd, p);
+ thread_lock(oldtd);
+ /* let the scheduler know about these things. */
+ sched_fork_thread(oldtd, newtd);
+ TD_SET_CAN_RUN(newtd);
+ thread_unlock(oldtd);
+ PROC_UNLOCK(p);
+
+ tidhash_add(newtd);
+
+ /* Avoid inheriting affinity from a random parent. */
+ cpuset_setthread(newtd->td_tid, cpuset_root);
+
+ /* Delay putting it on the run queue until now. */
+ if (!(flags & RFSTOPPED)) {
+ thread_lock(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
+ }
+ if (newtdp)
+ *newtdp = newtd;
+ return 0;
+}
+
+void
+kthread_exit(void)
+{
+ struct proc *p;
+
+ p = curthread->td_proc;
+
+ /* A module may be waiting for us to exit. */
+ wakeup(curthread);
+
+ /*
+ * The last exiting thread in a kernel process must tear down
+ * the whole process.
+ */
+ rw_wlock(&tidhash_lock);
+ PROC_LOCK(p);
+ if (p->p_numthreads == 1) {
+ PROC_UNLOCK(p);
+ rw_wunlock(&tidhash_lock);
+ kproc_exit(0);
+ }
+ LIST_REMOVE(curthread, td_hash);
+ rw_wunlock(&tidhash_lock);
+ PROC_SLOCK(p);
+ thread_exit();
+}
+
+/*
+ * Advise a kernel process to suspend (or resume) in its main loop.
+ * Participation is voluntary.
+ */
+int
+kthread_suspend(struct thread *td, int timo)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /*
+ * td_pflags should not be read by any thread other than
+ * curthread, but as long as this flag is invariant during the
+ * thread's lifetime, it is OK to check its state.
+ */
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ return (EINVAL);
+
+ /*
+ * The caller of the primitive should have already checked that the
+ * thread is up and running, thus not being blocked by other
+ * conditions.
+ */
+ PROC_LOCK(p);
+ thread_lock(td);
+ td->td_flags |= TDF_KTH_SUSP;
+ thread_unlock(td);
+ return (msleep(&td->td_flags, &p->p_mtx, PPAUSE | PDROP, "suspkt",
+ timo));
+}
+
+/*
+ * Resume a thread previously put asleep with kthread_suspend().
+ */
+int
+kthread_resume(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /*
+ * td_pflags should not be read by any thread other than
+ * curthread, but as long as this flag is invariant during the
+ * thread's lifetime, it is OK to check its state.
+ */
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ return (EINVAL);
+
+ PROC_LOCK(p);
+ thread_lock(td);
+ td->td_flags &= ~TDF_KTH_SUSP;
+ thread_unlock(td);
+ wakeup(&td->td_flags);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Used by the thread to poll whether it should yield/sleep
+ * and to notify the caller that it has happened.
+ */
+void
+kthread_suspend_check()
+{
+ struct proc *p;
+ struct thread *td;
+
+ td = curthread;
+ p = td->td_proc;
+
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ panic("%s: curthread is not a valid kthread", __func__);
+
+ /*
+ * As long as the double-lock protection is used when accessing the
+ * TDF_KTH_SUSP flag, synchronizing the read operation via proc mutex
+ * is fine.
+ */
+ PROC_LOCK(p);
+ while (td->td_flags & TDF_KTH_SUSP) {
+ wakeup(&td->td_flags);
+ msleep(&td->td_flags, &p->p_mtx, PPAUSE, "ktsusp", 0);
+ }
+ PROC_UNLOCK(p);
+}
+
+int
+kproc_kthread_add(void (*func)(void *), void *arg,
+ struct proc **procptr, struct thread **tdptr,
+ int flags, int pages, const char *procname, const char *fmt, ...)
+{
+ int error;
+ va_list ap;
+ char buf[100];
+ struct thread *td;
+
+ if (*procptr == 0) {
+ error = kproc_create(func, arg,
+ procptr, flags, pages, "%s", procname);
+ if (error)
+ return (error);
+ td = FIRST_THREAD_IN_PROC(*procptr);
+ if (tdptr)
+ *tdptr = td;
+ va_start(ap, fmt);
+ vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap);
+ va_end(ap);
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+ return (0);
+ }
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+ error = kthread_add(func, arg, *procptr,
+ tdptr, flags, pages, "%s", buf);
+ return (error);
+}
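/*
 * Illustrative usage sketch for kproc_kthread_add() (hypothetical worker
 * functions and variables, not code from this patch): the first call
 * creates the shared process and names its initial thread, the second
 * call just adds another thread to that process.  Worker bodies elided.
 */
static struct proc *example_proc;
static struct thread *example_up_td, *example_down_td;

static void example_up_main(void *arg);
static void example_down_main(void *arg);

static void
example_threads_start(void *dummy __unused)
{

	kproc_kthread_add(example_up_main, NULL, &example_proc,
	    &example_up_td, 0, 0, "example", "ex_up");
	kproc_kthread_add(example_down_main, NULL, &example_proc,
	    &example_down_td, 0, 0, "example", "ex_down");
}
SYSINIT(example_threads, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    example_threads_start, NULL);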
diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c
new file mode 100644
index 0000000..3202b9b
--- /dev/null
+++ b/sys/kern/kern_ktr.c
@@ -0,0 +1,495 @@
+/*-
+ * Copyright (c) 2000 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables used by KTR and the ktr_tracepoint()
+ * function that does the actual tracing.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_ktr.h"
+#include "opt_alq.h"
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/alq.h>
+#include <sys/cons.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+
+#include <machine/cpu.h>
+#ifdef __sparc64__
+#include <machine/ktr.h>
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_output.h>
+#endif
+
+#ifndef KTR_BOOT_ENTRIES
+#define KTR_BOOT_ENTRIES 1024
+#endif
+
+#ifndef KTR_ENTRIES
+#define KTR_ENTRIES 1024
+#endif
+
+/* Limit the allocations to something manageable. */
+#define KTR_ENTRIES_MAX (8 * 1024 * 1024)
+
+#ifndef KTR_MASK
+#define KTR_MASK (0)
+#endif
+
+#ifndef KTR_CPUMASK
+#define KTR_CPUMASK CPUSET_FSET
+#endif
+
+#ifndef KTR_TIME
+#define KTR_TIME get_cyclecount()
+#endif
+
+#ifndef KTR_CPU
+#define KTR_CPU PCPU_GET(cpuid)
+#endif
+
+static MALLOC_DEFINE(M_KTR, "KTR", "KTR");
+
+FEATURE(ktr, "Kernel support for KTR kernel tracing facility");
+
+volatile int ktr_idx = 0;
+int ktr_mask = KTR_MASK;
+int ktr_compile = KTR_COMPILE;
+int ktr_entries = KTR_BOOT_ENTRIES;
+int ktr_version = KTR_VERSION;
+struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES];
+struct ktr_entry *ktr_buf = ktr_buf_init;
+cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK);
+static char ktr_cpumask_str[CPUSETBUFSIZ];
+
+TUNABLE_INT("debug.ktr.mask", &ktr_mask);
+
+TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
+
+static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
+
+SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD,
+ &ktr_version, 0, "Version of the KTR interface");
+
+SYSCTL_UINT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
+ &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
+
+static void
+ktr_cpumask_initializer(void *dummy __unused)
+{
+
+ /*
+ * TUNABLE_STR() fetches the tunable at SI_ORDER_MIDDLE, so by the time
+ * this SYSINIT runs the string has already been filled in, if it was set.
+ */
+ if (ktr_cpumask_str[0] != '\0' &&
+ cpusetobj_strscan(&ktr_cpumask, ktr_cpumask_str) == -1)
+ CPU_FILL(&ktr_cpumask);
+}
+SYSINIT(ktr_cpumask_initializer, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ ktr_cpumask_initializer, NULL);
+
+static int
+sysctl_debug_ktr_cpumask(SYSCTL_HANDLER_ARGS)
+{
+ char lktr_cpumask_str[CPUSETBUFSIZ];
+ cpuset_t imask;
+ int error;
+
+ cpusetobj_strprint(lktr_cpumask_str, &ktr_cpumask);
+ error = sysctl_handle_string(oidp, lktr_cpumask_str,
+ sizeof(lktr_cpumask_str), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (cpusetobj_strscan(&imask, lktr_cpumask_str) == -1)
+ return (EINVAL);
+ CPU_COPY(&imask, &ktr_cpumask);
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, cpumask,
+ CTLFLAG_RW | CTLFLAG_MPSAFE | CTLTYPE_STRING, NULL, 0,
+ sysctl_debug_ktr_cpumask, "S",
+ "Bitmask of CPUs on which KTR logging is enabled");
+
+static int
+sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
+{
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (clear) {
+ bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries);
+ ktr_idx = 0;
+ }
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
+
+/*
+ * This is a sysctl proc so that it is serialized as !MPSAFE along with
+ * the other ktr sysctl procs.
+ */
+static int
+sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS)
+{
+ int mask, error;
+
+ mask = ktr_mask;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ ktr_mask = mask;
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_UINT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_mask, "IU",
+ "Bitmask of KTR event classes for which logging is enabled");
+
+#if KTR_ENTRIES > KTR_BOOT_ENTRIES
+/*
+ * A simplified version of sysctl_debug_ktr_entries.
+ * No need to care about SMP, scheduling, etc.
+ */
+static void
+ktr_entries_initializer(void *dummy __unused)
+{
+ int mask;
+
+ /* Temporarily disable ktr in case malloc() is being traced. */
+ mask = ktr_mask;
+ ktr_mask = 0;
+ ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR,
+ M_WAITOK | M_ZERO);
+ memcpy(ktr_buf, ktr_buf_init + ktr_idx,
+ (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf));
+ if (ktr_idx != 0)
+ memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init,
+ ktr_idx * sizeof(*ktr_buf));
+ ktr_entries = KTR_ENTRIES;
+ ktr_mask = mask;
+}
+SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY,
+ ktr_entries_initializer, NULL);
+#endif
+
+static int
+sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS)
+{
+ int entries, error, mask;
+ struct ktr_entry *buf, *oldbuf;
+
+ entries = ktr_entries;
+ error = sysctl_handle_int(oidp, &entries, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (entries > KTR_ENTRIES_MAX)
+ return (ERANGE);
+ /* Disable ktr temporarily. */
+ mask = ktr_mask;
+ atomic_store_rel_int(&ktr_mask, 0);
+ /* Wait for threads to go idle. */
+ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) {
+ ktr_mask = mask;
+ return (error);
+ }
+ if (ktr_buf != ktr_buf_init)
+ oldbuf = ktr_buf;
+ else
+ oldbuf = NULL;
+ /* Allocate a new buffer. */
+ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO);
+ /* Install the new buffer and restart ktr. */
+ ktr_buf = buf;
+ ktr_entries = entries;
+ ktr_idx = 0;
+ atomic_store_rel_int(&ktr_mask, mask);
+ if (oldbuf != NULL)
+ free(oldbuf, M_KTR);
+
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer");
+
+#ifdef KTR_VERBOSE
+int ktr_verbose = KTR_VERBOSE;
+TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
+SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, "");
+#endif
+
+#ifdef KTR_ALQ
+struct alq *ktr_alq;
+char ktr_alq_file[MAXPATHLEN] = "/tmp/ktr.out";
+int ktr_alq_cnt = 0;
+int ktr_alq_depth = KTR_ENTRIES;
+int ktr_alq_enabled = 0;
+int ktr_alq_failed = 0;
+int ktr_alq_max = 0;
+
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_max, CTLFLAG_RW, &ktr_alq_max, 0,
+ "Maximum number of entries to write");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_cnt, CTLFLAG_RD, &ktr_alq_cnt, 0,
+ "Current number of written entries");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_failed, CTLFLAG_RD, &ktr_alq_failed, 0,
+ "Number of times we overran the buffer");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_depth, CTLFLAG_RW, &ktr_alq_depth, 0,
+ "Number of items in the write buffer");
+SYSCTL_STRING(_debug_ktr, OID_AUTO, alq_file, CTLFLAG_RW, ktr_alq_file,
+ sizeof(ktr_alq_file), "KTR logging file");
+
+static int
+sysctl_debug_ktr_alq_enable(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int enable;
+
+ enable = ktr_alq_enabled;
+
+ error = sysctl_handle_int(oidp, &enable, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (enable) {
+ if (ktr_alq_enabled)
+ return (0);
+ error = alq_open(&ktr_alq, (const char *)ktr_alq_file,
+ req->td->td_ucred, ALQ_DEFAULT_CMODE,
+ sizeof(struct ktr_entry), ktr_alq_depth);
+ if (error == 0) {
+ ktr_alq_cnt = 0;
+ ktr_alq_failed = 0;
+ ktr_alq_enabled = 1;
+ }
+ } else {
+ if (ktr_alq_enabled == 0)
+ return (0);
+ ktr_alq_enabled = 0;
+ alq_close(ktr_alq);
+ ktr_alq = NULL;
+ }
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, alq_enable,
+ CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_alq_enable,
+ "I", "Enable KTR logging");
+#endif
+
+void
+ktr_tracepoint(u_int mask, const char *file, int line, const char *format,
+ u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5,
+ u_long arg6)
+{
+ struct ktr_entry *entry;
+#ifdef KTR_ALQ
+ struct ale *ale = NULL;
+#endif
+ int newindex, saveindex;
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ struct thread *td;
+#endif
+ int cpu;
+
+ if (panicstr)
+ return;
+ if ((ktr_mask & mask) == 0 || ktr_buf == NULL)
+ return;
+ cpu = KTR_CPU;
+ if (!CPU_ISSET(cpu, &ktr_cpumask))
+ return;
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ td = curthread;
+ if (td->td_pflags & TDP_INKTR)
+ return;
+ td->td_pflags |= TDP_INKTR;
+#endif
+#ifdef KTR_ALQ
+ if (ktr_alq_enabled) {
+ if (td->td_critnest == 0 &&
+ (td->td_flags & TDF_IDLETD) == 0 &&
+ td != ald_thread) {
+ if (ktr_alq_max && ktr_alq_cnt > ktr_alq_max)
+ goto done;
+ if ((ale = alq_get(ktr_alq, ALQ_NOWAIT)) == NULL) {
+ ktr_alq_failed++;
+ goto done;
+ }
+ ktr_alq_cnt++;
+ entry = (struct ktr_entry *)ale->ae_data;
+ } else {
+ goto done;
+ }
+ } else
+#endif
+ {
+ do {
+ saveindex = ktr_idx;
+ newindex = (saveindex + 1) % ktr_entries;
+ } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0);
+ entry = &ktr_buf[saveindex];
+ }
+ entry->ktr_timestamp = KTR_TIME;
+ entry->ktr_cpu = cpu;
+ entry->ktr_thread = curthread;
+ if (file != NULL)
+ while (strncmp(file, "../", 3) == 0)
+ file += 3;
+ entry->ktr_file = file;
+ entry->ktr_line = line;
+#ifdef KTR_VERBOSE
+ if (ktr_verbose) {
+#ifdef SMP
+ printf("cpu%d ", cpu);
+#endif
+ if (ktr_verbose > 1) {
+ printf("%s.%d\t", entry->ktr_file,
+ entry->ktr_line);
+ }
+ printf(format, arg1, arg2, arg3, arg4, arg5, arg6);
+ printf("\n");
+ }
+#endif
+ entry->ktr_desc = format;
+ entry->ktr_parms[0] = arg1;
+ entry->ktr_parms[1] = arg2;
+ entry->ktr_parms[2] = arg3;
+ entry->ktr_parms[3] = arg4;
+ entry->ktr_parms[4] = arg5;
+ entry->ktr_parms[5] = arg6;
+#ifdef KTR_ALQ
+ if (ktr_alq_enabled && ale)
+ alq_post(ktr_alq, ale);
+done:
+#endif
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ td->td_pflags &= ~TDP_INKTR;
+#endif
+}
+
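ktr_tracepoint() is normally reached through the CTR0()..CTR6() macros from <sys/ktr.h>, which compile to nothing unless the event class is present in KTR_COMPILE and are then filtered at run time by ktr_mask and ktr_cpumask. Below is a hedged sketch of an instrumentation site; the class, message and fields are examples and do not come from this file.

/* Sketch only: kernel code logging an event through KTR. */
#include <sys/param.h>
#include <sys/ktr.h>
#include <sys/proc.h>

static void
example_trace(struct proc *p)
{

	CTR2(KTR_PROC, "example: pid %d flags 0x%x", p->p_pid, p->p_flag);
}
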
+#ifdef DDB
+
+struct tstate {
+ int cur;
+ int first;
+};
+static struct tstate tstate;
+static int db_ktr_verbose;
+static int db_mach_vtrace(void);
+
+DB_SHOW_COMMAND(ktr, db_ktr_all)
+{
+
+ tstate.cur = (ktr_idx - 1) % ktr_entries;
+ tstate.first = -1;
+ db_ktr_verbose = 0;
+ db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0;
+ db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just the timestamp, please */
+ if (strchr(modif, 'a') != NULL) {
+ db_disable_pager();
+ while (cncheckc() != -1)
+ if (db_mach_vtrace() == 0)
+ break;
+ } else {
+ while (!db_pager_quit)
+ if (db_mach_vtrace() == 0)
+ break;
+ }
+}
+
+static int
+db_mach_vtrace(void)
+{
+ struct ktr_entry *kp;
+
+ if (tstate.cur == tstate.first || ktr_buf == NULL) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ kp = &ktr_buf[tstate.cur];
+
+ /* Skip over unused entries. */
+ if (kp->ktr_desc == NULL) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ db_printf("%d (%p", tstate.cur, kp->ktr_thread);
+#ifdef SMP
+ db_printf(":cpu%d", kp->ktr_cpu);
+#endif
+ db_printf(")");
+ if (db_ktr_verbose >= 1) {
+ db_printf(" %10.10lld", (long long)kp->ktr_timestamp);
+ }
+ if (db_ktr_verbose >= 2) {
+ db_printf(" %s.%d", kp->ktr_file, kp->ktr_line);
+ }
+ db_printf(": ");
+ db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1],
+ kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4],
+ kp->ktr_parms[5]);
+ db_printf("\n");
+
+ if (tstate.first == -1)
+ tstate.first = tstate.cur;
+
+ if (--tstate.cur < 0)
+ tstate.cur = ktr_entries - 1;
+
+ return (1);
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 0000000..3b34fb0
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,1269 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/ktrace.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * The ktrace facility allows the tracing of certain key events in user space
+ * processes, such as system calls, signal delivery, context switches, and
+ * user generated events using utrace(2). It works by streaming event
+ * records and data to a vnode associated with the process using the
+ * ktrace(2) system call. In general, records can be written directly from
+ * the context that generates the event. One important exception to this is
+ * during a context switch, where sleeping is not permitted. To handle this
+ * case, trace events are generated using in-kernel ktr_request records, and
+ * then delivered to disk at a convenient moment -- either immediately, the
+ * next traceable event, at system call return, or at process exit.
+ *
+ * When dealing with multiple threads or processes writing to the same event
+ * log, ordering guarantees are weak: specifically, if an event has multiple
+ * records (i.e., system call enter and return), they may be interlaced with
+ * records from another event. Process and thread ID information is provided
+ * in the record, and user applications can de-interlace events if required.
+ */
+
+static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
+
+#ifdef KTRACE
+
+FEATURE(ktrace, "Kernel support for system-call tracing");
+
+#ifndef KTRACE_REQUEST_POOL
+#define KTRACE_REQUEST_POOL 100
+#endif
+
+struct ktr_request {
+ struct ktr_header ktr_header;
+ void *ktr_buffer;
+ union {
+ struct ktr_proc_ctor ktr_proc_ctor;
+ struct ktr_cap_fail ktr_cap_fail;
+ struct ktr_syscall ktr_syscall;
+ struct ktr_sysret ktr_sysret;
+ struct ktr_genio ktr_genio;
+ struct ktr_psig ktr_psig;
+ struct ktr_csw ktr_csw;
+ struct ktr_fault ktr_fault;
+ struct ktr_faultend ktr_faultend;
+ } ktr_data;
+ STAILQ_ENTRY(ktr_request) ktr_list;
+};
+
+static int data_lengths[] = {
+ 0, /* none */
+ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
+ sizeof(struct ktr_sysret), /* KTR_SYSRET */
+ 0, /* KTR_NAMEI */
+ sizeof(struct ktr_genio), /* KTR_GENIO */
+ sizeof(struct ktr_psig), /* KTR_PSIG */
+ sizeof(struct ktr_csw), /* KTR_CSW */
+ 0, /* KTR_USER */
+ 0, /* KTR_STRUCT */
+ 0, /* KTR_SYSCTL */
+ sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
+ 0, /* KTR_PROCDTOR */
+ sizeof(struct ktr_cap_fail), /* KTR_CAPFAIL */
+ sizeof(struct ktr_fault), /* KTR_FAULT */
+ sizeof(struct ktr_faultend), /* KTR_FAULTEND */
+};
+
+static STAILQ_HEAD(, ktr_request) ktr_free;
+
+static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
+
+static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
+TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
+
+static u_int ktr_geniosize = PAGE_SIZE;
+TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
+SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
+ 0, "Maximum size of genio event payload");
+
+static int print_message = 1;
+static struct mtx ktrace_mtx;
+static struct sx ktrace_sx;
+
+static void ktrace_init(void *dummy);
+static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
+static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
+static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
+static struct ktr_request *ktr_getrequest(int type);
+static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
+static void ktr_freeproc(struct proc *p, struct ucred **uc,
+ struct vnode **vp);
+static void ktr_freerequest(struct ktr_request *req);
+static void ktr_freerequest_locked(struct ktr_request *req);
+static void ktr_writerequest(struct thread *td, struct ktr_request *req);
+static int ktrcanset(struct thread *,struct proc *);
+static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
+static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+static void ktrprocctor_entered(struct thread *, struct proc *);
+
+/*
+ * ktrace itself generates events, such as context switches, which we do not
+ * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
+ * whether or not it is in a region where tracing of events should be
+ * suppressed.
+ */
+static void
+ktrace_enter(struct thread *td)
+{
+
+ KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
+ td->td_pflags |= TDP_INKTRACE;
+}
+
+static void
+ktrace_exit(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
+ td->td_pflags &= ~TDP_INKTRACE;
+}
+
+static void
+ktrace_assert(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
+}
+
+static void
+ktrace_init(void *dummy)
+{
+ struct ktr_request *req;
+ int i;
+
+ mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
+ sx_init(&ktrace_sx, "ktrace_sx");
+ STAILQ_INIT(&ktr_free);
+ for (i = 0; i < ktr_requestpool; i++) {
+ req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ }
+}
+SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
+
+static int
+sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
+{
+ struct thread *td;
+ u_int newsize, oldsize, wantsize;
+ int error;
+
+ /* Handle easy read-only case first to avoid warnings from GCC. */
+ if (!req->newptr) {
+ oldsize = ktr_requestpool;
+ return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
+ }
+
+ error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
+ if (error)
+ return (error);
+ td = curthread;
+ ktrace_enter(td);
+ oldsize = ktr_requestpool;
+ newsize = ktrace_resize_pool(oldsize, wantsize);
+ ktrace_exit(td);
+ error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
+ if (error)
+ return (error);
+ if (wantsize > oldsize && newsize < wantsize)
+ return (ENOSPC);
+ return (0);
+}
+SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
+ &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU",
+ "Pool buffer size for ktrace(1)");
+
+static u_int
+ktrace_resize_pool(u_int oldsize, u_int newsize)
+{
+ STAILQ_HEAD(, ktr_request) ktr_new;
+ struct ktr_request *req;
+ int bound;
+
+ print_message = 1;
+ bound = newsize - oldsize;
+ if (bound == 0)
+ return (ktr_requestpool);
+ if (bound < 0) {
+ mtx_lock(&ktrace_mtx);
+ /* Shrink pool down to newsize if possible. */
+ while (bound++ < 0) {
+ req = STAILQ_FIRST(&ktr_free);
+ if (req == NULL)
+ break;
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ ktr_requestpool--;
+ free(req, M_KTRACE);
+ }
+ } else {
+ /* Grow pool up to newsize. */
+ STAILQ_INIT(&ktr_new);
+ while (bound-- > 0) {
+ req = malloc(sizeof(struct ktr_request), M_KTRACE,
+ M_WAITOK);
+ STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
+ }
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&ktr_free, &ktr_new);
+ ktr_requestpool += (newsize - oldsize);
+ }
+ mtx_unlock(&ktrace_mtx);
+ return (ktr_requestpool);
+}
+
+/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
+CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
+ (sizeof((struct thread *)NULL)->td_name));
+
+static struct ktr_request *
+ktr_getrequest_entered(struct thread *td, int type)
+{
+ struct ktr_request *req;
+ struct proc *p = td->td_proc;
+ int pm;
+
+ mtx_lock(&ktrace_mtx);
+ if (!KTRCHECK(td, type)) {
+ mtx_unlock(&ktrace_mtx);
+ return (NULL);
+ }
+ req = STAILQ_FIRST(&ktr_free);
+ if (req != NULL) {
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ req->ktr_header.ktr_type = type;
+ if (p->p_traceflag & KTRFAC_DROP) {
+ req->ktr_header.ktr_type |= KTR_DROP;
+ p->p_traceflag &= ~KTRFAC_DROP;
+ }
+ mtx_unlock(&ktrace_mtx);
+ microtime(&req->ktr_header.ktr_time);
+ req->ktr_header.ktr_pid = p->p_pid;
+ req->ktr_header.ktr_tid = td->td_tid;
+ bcopy(td->td_name, req->ktr_header.ktr_comm,
+ sizeof(req->ktr_header.ktr_comm));
+ req->ktr_buffer = NULL;
+ req->ktr_header.ktr_len = 0;
+ } else {
+ p->p_traceflag |= KTRFAC_DROP;
+ pm = print_message;
+ print_message = 0;
+ mtx_unlock(&ktrace_mtx);
+ if (pm)
+ printf("Out of ktrace request objects.\n");
+ }
+ return (req);
+}
+
+static struct ktr_request *
+ktr_getrequest(int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, type);
+ if (req == NULL)
+ ktrace_exit(td);
+
+ return (req);
+}
+
+/*
+ * Some trace generation environments don't permit direct access to VFS,
+ * such as during a context switch where sleeping is not allowed. Under these
+ * circumstances, queue a request to the thread to be written asynchronously
+ * later.
+ */
+static void
+ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
+{
+
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * Drain any pending ktrace records from the per-thread queue to disk. This
+ * is used both internally before committing other records, and also on
+ * system call return. We drain all the ones we can find at the time when
+ * drain is requested, but don't keep draining after that as those events
+ * may be approximately "after" the current event.
+ */
+static void
+ktr_drain(struct thread *td)
+{
+ struct ktr_request *queued_req;
+ STAILQ_HEAD(, ktr_request) local_queue;
+
+ ktrace_assert(td);
+ sx_assert(&ktrace_sx, SX_XLOCKED);
+
+ STAILQ_INIT(&local_queue);
+
+ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
+ mtx_unlock(&ktrace_mtx);
+
+ while ((queued_req = STAILQ_FIRST(&local_queue))) {
+ STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
+ ktr_writerequest(td, queued_req);
+ ktr_freerequest(queued_req);
+ }
+ }
+}
+
+/*
+ * Submit a trace record for immediate commit to disk -- to be used only
+ * where entering VFS is OK. First drain any pending records that may have
+ * been cached in the thread.
+ */
+static void
+ktr_submitrequest(struct thread *td, struct ktr_request *req)
+{
+
+ ktrace_assert(td);
+
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ ktr_writerequest(td, req);
+ ktr_freerequest(req);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
+}
+
+static void
+ktr_freerequest(struct ktr_request *req)
+{
+
+ mtx_lock(&ktrace_mtx);
+ ktr_freerequest_locked(req);
+ mtx_unlock(&ktrace_mtx);
+}
+
+static void
+ktr_freerequest_locked(struct ktr_request *req)
+{
+
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ if (req->ktr_buffer != NULL)
+ free(req->ktr_buffer, M_KTRACE);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+}
+
+/*
+ * Disable tracing for a process and release all associated resources.
+ * The caller is responsible for releasing a reference on the returned
+ * vnode and credentials.
+ */
+static void
+ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+ struct ktr_request *req;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ *uc = p->p_tracecred;
+ p->p_tracecred = NULL;
+ if (vp != NULL)
+ *vp = p->p_tracevp;
+ p->p_tracevp = NULL;
+ p->p_traceflag = 0;
+ while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
+ STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
+ ktr_freerequest_locked(req);
+ }
+}
+
+void
+ktrsyscall(int code, int narg, register_t args[])
+{
+ struct ktr_request *req;
+ struct ktr_syscall *ktp;
+ size_t buflen;
+ char *buf = NULL;
+
+ buflen = sizeof(register_t) * narg;
+ if (buflen > 0) {
+ buf = malloc(buflen, M_KTRACE, M_WAITOK);
+ bcopy(args, buf, buflen);
+ }
+ req = ktr_getrequest(KTR_SYSCALL);
+ if (req == NULL) {
+ if (buf != NULL)
+ free(buf, M_KTRACE);
+ return;
+ }
+ ktp = &req->ktr_data.ktr_syscall;
+ ktp->ktr_code = code;
+ ktp->ktr_narg = narg;
+ if (buflen > 0) {
+ req->ktr_header.ktr_len = buflen;
+ req->ktr_buffer = buf;
+ }
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrsysret(int code, int error, register_t retval)
+{
+ struct ktr_request *req;
+ struct ktr_sysret *ktp;
+
+ req = ktr_getrequest(KTR_SYSRET);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_sysret;
+ ktp->ktr_code = code;
+ ktp->ktr_error = error;
+ ktp->ktr_retval = ((error == 0) ? retval: 0); /* what about val2 ? */
+ ktr_submitrequest(curthread, req);
+}
+
+/*
+ * When a setuid process execs, disable tracing.
+ *
+ * XXX: We toss any pending asynchronous records.
+ */
+void
+ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, uc, vp);
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records
+ * and disable tracing.
+ */
+void
+ktrprocexit(struct thread *td)
+{
+ struct ktr_request *req;
+ struct proc *p;
+ struct ucred *cred;
+ struct vnode *vp;
+
+ p = td->td_proc;
+ if (p->p_traceflag == 0)
+ return;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, KTR_PROCDTOR);
+ if (req != NULL)
+ ktr_enqueuerequest(td, req);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, &vp);
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (vp != NULL)
+ vrele(vp);
+ if (cred != NULL)
+ crfree(cred);
+ ktrace_exit(td);
+}
+
+static void
+ktrprocctor_entered(struct thread *td, struct proc *p)
+{
+ struct ktr_proc_ctor *ktp;
+ struct ktr_request *req;
+ struct thread *td2;
+
+ ktrace_assert(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_proc_ctor;
+ ktp->sv_flags = p->p_sysent->sv_flags;
+ ktr_enqueuerequest(td2, req);
+}
+
+void
+ktrprocctor(struct proc *p)
+{
+ struct thread *td = curthread;
+
+ if ((p->p_traceflag & KTRFAC_MASK) == 0)
+ return;
+
+ ktrace_enter(td);
+ ktrprocctor_entered(td, p);
+ ktrace_exit(td);
+}
+
+/*
+ * When a process forks, enable tracing in the new process if needed.
+ */
+void
+ktrprocfork(struct proc *p1, struct proc *p2)
+{
+
+ PROC_LOCK(p1);
+ mtx_lock(&ktrace_mtx);
+ KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
+ if (p1->p_traceflag & KTRFAC_INHERIT) {
+ p2->p_traceflag = p1->p_traceflag;
+ if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
+ VREF(p2->p_tracevp);
+ KASSERT(p1->p_tracecred != NULL,
+ ("ktrace vnode with no cred"));
+ p2->p_tracecred = crhold(p1->p_tracecred);
+ }
+ }
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p1);
+
+ ktrprocctor(p2);
+}
+
+/*
+ * When a thread returns, drain any asynchronous records generated by the
+ * system call.
+ */
+void
+ktruserret(struct thread *td)
+{
+
+ ktrace_enter(td);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
+}
+
+void
+ktrnamei(char *path)
+{
+ struct ktr_request *req;
+ int namelen;
+ char *buf = NULL;
+
+ namelen = strlen(path);
+ if (namelen > 0) {
+ buf = malloc(namelen, M_KTRACE, M_WAITOK);
+ bcopy(path, buf, namelen);
+ }
+ req = ktr_getrequest(KTR_NAMEI);
+ if (req == NULL) {
+ if (buf != NULL)
+ free(buf, M_KTRACE);
+ return;
+ }
+ if (namelen > 0) {
+ req->ktr_header.ktr_len = namelen;
+ req->ktr_buffer = buf;
+ }
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrsysctl(int *name, u_int namelen)
+{
+ struct ktr_request *req;
+ u_int mib[CTL_MAXNAME + 2];
+ char *mibname;
+ size_t mibnamelen;
+ int error;
+
+ /* Lookup name of mib. */
+ KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
+ mib[0] = 0;
+ mib[1] = 1;
+ bcopy(name, mib + 2, namelen * sizeof(*name));
+ mibnamelen = 128;
+ mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
+ error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
+ NULL, 0, &mibnamelen, 0);
+ if (error) {
+ free(mibname, M_KTRACE);
+ return;
+ }
+ req = ktr_getrequest(KTR_SYSCTL);
+ if (req == NULL) {
+ free(mibname, M_KTRACE);
+ return;
+ }
+ req->ktr_header.ktr_len = mibnamelen;
+ req->ktr_buffer = mibname;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrgenio(int fd, enum uio_rw rw, struct uio *uio, int error)
+{
+ struct ktr_request *req;
+ struct ktr_genio *ktg;
+ int datalen;
+ char *buf;
+
+ if (error) {
+ free(uio, M_IOV);
+ return;
+ }
+ uio->uio_offset = 0;
+ uio->uio_rw = UIO_WRITE;
+ datalen = MIN(uio->uio_resid, ktr_geniosize);
+ buf = malloc(datalen, M_KTRACE, M_WAITOK);
+ error = uiomove(buf, datalen, uio);
+ free(uio, M_IOV);
+ if (error) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ req = ktr_getrequest(KTR_GENIO);
+ if (req == NULL) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ ktg = &req->ktr_data.ktr_genio;
+ ktg->ktr_fd = fd;
+ ktg->ktr_rw = rw;
+ req->ktr_header.ktr_len = datalen;
+ req->ktr_buffer = buf;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrpsig(int sig, sig_t action, sigset_t *mask, int code)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_psig *kp;
+
+ req = ktr_getrequest(KTR_PSIG);
+ if (req == NULL)
+ return;
+ kp = &req->ktr_data.ktr_psig;
+ kp->signo = (char)sig;
+ kp->action = action;
+ kp->mask = *mask;
+ kp->code = code;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrcsw(int out, int user, const char *wmesg)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_csw *kc;
+
+ req = ktr_getrequest(KTR_CSW);
+ if (req == NULL)
+ return;
+ kc = &req->ktr_data.ktr_csw;
+ kc->out = out;
+ kc->user = user;
+ if (wmesg != NULL)
+ strlcpy(kc->wmesg, wmesg, sizeof(kc->wmesg));
+ else
+ bzero(kc->wmesg, sizeof(kc->wmesg));
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrstruct(const char *name, void *data, size_t datalen)
+{
+ struct ktr_request *req;
+ char *buf = NULL;
+ size_t buflen;
+
+ if (!data)
+ datalen = 0;
+ buflen = strlen(name) + 1 + datalen;
+ buf = malloc(buflen, M_KTRACE, M_WAITOK);
+ strcpy(buf, name);
+ bcopy(data, buf + strlen(name) + 1, datalen);
+ if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ req->ktr_buffer = buf;
+ req->ktr_header.ktr_len = buflen;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrcapfail(enum ktr_cap_fail_type type, const cap_rights_t *needed,
+ const cap_rights_t *held)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_cap_fail *kcf;
+
+ req = ktr_getrequest(KTR_CAPFAIL);
+ if (req == NULL)
+ return;
+ kcf = &req->ktr_data.ktr_cap_fail;
+ kcf->cap_type = type;
+ kcf->cap_needed = *needed;
+ kcf->cap_held = *held;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrfault(vm_offset_t vaddr, int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_fault *kf;
+
+ req = ktr_getrequest(KTR_FAULT);
+ if (req == NULL)
+ return;
+ kf = &req->ktr_data.ktr_fault;
+ kf->vaddr = vaddr;
+ kf->type = type;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrfaultend(int result)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_faultend *kf;
+
+ req = ktr_getrequest(KTR_FAULTEND);
+ if (req == NULL)
+ return;
+ kf = &req->ktr_data.ktr_faultend;
+ kf->result = result;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+#endif /* KTRACE */
+
+/* Interface and common routines */
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktrace_args {
+ char *fname;
+ int ops;
+ int facs;
+ int pid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ktrace(struct thread *td, struct ktrace_args *uap)
+{
+#ifdef KTRACE
+ struct vnode *vp = NULL;
+ struct proc *p;
+ struct pgrp *pg;
+ int facs = uap->facs & ~KTRFAC_ROOT;
+ int ops = KTROP(uap->ops);
+ int descend = uap->ops & KTRFLAG_DESCEND;
+ int nfound, ret = 0;
+ int flags, error = 0;
+ struct nameidata nd;
+ struct ucred *cred;
+
+ /*
+ * Need something to (un)trace.
+ */
+ if (ops != KTROP_CLEARFILE && facs == 0)
+ return (EINVAL);
+
+ ktrace_enter(td);
+ if (ops != KTROP_CLEAR) {
+ /*
+ * an operation which requires a file argument.
+ */
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td);
+ flags = FREAD | FWRITE | O_NOFOLLOW;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error) {
+ ktrace_exit(td);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VREG) {
+ (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+ ktrace_exit(td);
+ return (EACCES);
+ }
+ }
+ /*
+ * Clear all uses of the tracefile.
+ */
+ if (ops == KTROP_CLEARFILE) {
+ int vrele_count;
+
+ vrele_count = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ if (ktrcanset(td, p)) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ crfree(cred);
+ } else
+ error = EPERM;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ if (vrele_count > 0) {
+ while (vrele_count-- > 0)
+ vrele(vp);
+ }
+ goto done;
+ }
+ /*
+ * do it
+ */
+ sx_slock(&proctree_lock);
+ if (uap->pid < 0) {
+ /*
+ * by process group
+ */
+ pg = pgfind(-uap->pid);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ error = ESRCH;
+ goto done;
+ }
+ /*
+ * ktrops() may call vrele(). Lock pg_members
+ * by the proctree_lock rather than pg_mtx.
+ */
+ PGRP_UNLOCK(pg);
+ nfound = 0;
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW ||
+ p_cansee(td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ nfound++;
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ }
+ if (nfound == 0) {
+ sx_sunlock(&proctree_lock);
+ error = ESRCH;
+ goto done;
+ }
+ } else {
+ /*
+ * by pid
+ */
+ p = pfind(uap->pid);
+ if (p == NULL)
+ error = ESRCH;
+ else
+ error = p_cansee(td, p);
+ if (error) {
+ if (p != NULL)
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ goto done;
+ }
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ }
+ sx_sunlock(&proctree_lock);
+ if (!ret)
+ error = EPERM;
+done:
+ if (vp != NULL)
+ (void) vn_close(vp, FWRITE, td->td_ucred, td);
+ ktrace_exit(td);
+ return (error);
+#else /* !KTRACE */
+ return (ENOSYS);
+#endif /* KTRACE */
+}
+
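sys_ktrace() above is the entry point behind the ktrace(1) utility. The minimal, hedged userland example below enables system-call tracing of the calling process and then switches it off again; the resulting ktrace.out is read with kdump(1).

/* Illustrative use of ktrace(2): trace this process's own system calls. */
#include <sys/param.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/ktrace.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{

	if (ktrace("ktrace.out", KTROP_SET,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid()) == -1) {
		perror("ktrace");
		return (1);
	}
	printf("hello, traced world\n");
	(void)ktrace("ktrace.out", KTROP_CLEAR,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid());
	return (0);
}

KTROP_CLEARFILE, handled in the function above, is the heavier operation that detaches every process still writing to a given trace vnode.
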
+/* ARGSUSED */
+int
+sys_utrace(struct thread *td, struct utrace_args *uap)
+{
+
+#ifdef KTRACE
+ struct ktr_request *req;
+ void *cp;
+ int error;
+
+ if (!KTRPOINT(td, KTR_USER))
+ return (0);
+ if (uap->len > KTR_USER_MAXLEN)
+ return (EINVAL);
+ cp = malloc(uap->len, M_KTRACE, M_WAITOK);
+ error = copyin(uap->addr, cp, uap->len);
+ if (error) {
+ free(cp, M_KTRACE);
+ return (error);
+ }
+ req = ktr_getrequest(KTR_USER);
+ if (req == NULL) {
+ free(cp, M_KTRACE);
+ return (ENOMEM);
+ }
+ req->ktr_buffer = cp;
+ req->ktr_header.ktr_len = uap->len;
+ ktr_submitrequest(td, req);
+ return (0);
+#else /* !KTRACE */
+ return (ENOSYS);
+#endif /* KTRACE */
+}
+
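sys_utrace() lets a traced program inject its own application-defined records (KTR_USER) into the ktrace stream; kdump(1) prints them as USER events. A hedged example follows, assuming the documented utrace(2) prototype; the record layout is invented and entirely up to the application, subject only to the KTR_USER_MAXLEN cap enforced above.

/* Illustrative use of utrace(2): emit a custom marker record. */
#include <sys/param.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/ktrace.h>
#include <string.h>

struct example_utrace_rec {	/* application-defined payload */
	char	tag[8];
	int	value;
};

static void
example_marker(int value)
{
	struct example_utrace_rec rec;

	memset(&rec, 0, sizeof(rec));
	strlcpy(rec.tag, "MARK", sizeof(rec.tag));
	rec.value = value;
	/* A no-op unless the process is ktrace'd with the user facility. */
	(void)utrace(&rec, sizeof(rec));
}
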
+#ifdef KTRACE
+static int
+ktrops(struct thread *td, struct proc *p, int ops, int facs,
+ struct vnode *vp)
+{
+ struct vnode *tracevp = NULL;
+ struct ucred *tracecred = NULL;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (!ktrcanset(td, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ if (p->p_flag & P_WEXIT) {
+ /* If the process is exiting, just ignore it. */
+ PROC_UNLOCK(p);
+ return (1);
+ }
+ mtx_lock(&ktrace_mtx);
+ if (ops == KTROP_SET) {
+ if (p->p_tracevp != vp) {
+ /*
+ * if trace file already in use, relinquish below
+ */
+ tracevp = p->p_tracevp;
+ VREF(vp);
+ p->p_tracevp = vp;
+ }
+ if (p->p_tracecred != td->td_ucred) {
+ tracecred = p->p_tracecred;
+ p->p_tracecred = crhold(td->td_ucred);
+ }
+ p->p_traceflag |= facs;
+ if (priv_check(td, PRIV_KTRACE) == 0)
+ p->p_traceflag |= KTRFAC_ROOT;
+ } else {
+ /* KTROP_CLEAR */
+ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
+ /* no more tracing */
+ ktr_freeproc(p, &tracecred, &tracevp);
+ }
+ mtx_unlock(&ktrace_mtx);
+ if ((p->p_traceflag & KTRFAC_MASK) != 0)
+ ktrprocctor_entered(td, p);
+ PROC_UNLOCK(p);
+ if (tracevp != NULL)
+ vrele(tracevp);
+ if (tracecred != NULL)
+ crfree(tracecred);
+
+ return (1);
+}
+
+static int
+ktrsetchildren(struct thread *td, struct proc *top, int ops, int facs,
+ struct vnode *vp)
+{
+ struct proc *p;
+ int ret = 0;
+
+ p = top;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (;;) {
+ ret |= ktrops(td, p, ops, facs, vp);
+ /*
+ * If this process has children, descend to them next,
+ * otherwise do any siblings, and if done with this level,
+ * follow back up the tree (but not past top).
+ */
+ if (!LIST_EMPTY(&p->p_children))
+ p = LIST_FIRST(&p->p_children);
+ else for (;;) {
+ if (p == top)
+ return (ret);
+ if (LIST_NEXT(p, p_sibling)) {
+ p = LIST_NEXT(p, p_sibling);
+ break;
+ }
+ p = p->p_pptr;
+ }
+ PROC_LOCK(p);
+ }
+ /*NOTREACHED*/
+}
+
+static void
+ktr_writerequest(struct thread *td, struct ktr_request *req)
+{
+ struct ktr_header *kth;
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+ struct uio auio;
+ struct iovec aiov[3];
+ struct mount *mp;
+ int datalen, buflen, vrele_count;
+ int error;
+
+ /*
+ * We hold the vnode and credential for use in I/O in case ktrace is
+ * disabled on the process as we write out the request.
+ *
+ * XXXRW: This is not ideal: we could end up performing a write after
+ * the vnode has been closed.
+ */
+ mtx_lock(&ktrace_mtx);
+ vp = td->td_proc->p_tracevp;
+ cred = td->td_proc->p_tracecred;
+
+ /*
+ * If vp is NULL, the vp has been cleared out from under this
+ * request, so just drop it. Make sure the credential and vnode are
+ * in sync: we should have both or neither.
+ */
+ if (vp == NULL) {
+ KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
+ mtx_unlock(&ktrace_mtx);
+ return;
+ }
+ VREF(vp);
+ KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
+ crhold(cred);
+ mtx_unlock(&ktrace_mtx);
+
+ kth = &req->ktr_header;
+ KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
+ sizeof(data_lengths) / sizeof(data_lengths[0]),
+ ("data_lengths array overflow"));
+ datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
+ buflen = kth->ktr_len;
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ aiov[0].iov_base = (caddr_t)kth;
+ aiov[0].iov_len = sizeof(struct ktr_header);
+ auio.uio_resid = sizeof(struct ktr_header);
+ auio.uio_iovcnt = 1;
+ auio.uio_td = td;
+ if (datalen != 0) {
+ aiov[1].iov_base = (caddr_t)&req->ktr_data;
+ aiov[1].iov_len = datalen;
+ auio.uio_resid += datalen;
+ auio.uio_iovcnt++;
+ kth->ktr_len += datalen;
+ }
+ if (buflen != 0) {
+ KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
+ aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
+ aiov[auio.uio_iovcnt].iov_len = buflen;
+ auio.uio_resid += buflen;
+ auio.uio_iovcnt++;
+ }
+
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_write(cred, NOCRED, vp);
+ if (error == 0)
+#endif
+ error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ crfree(cred);
+ if (!error) {
+ vrele(vp);
+ return;
+ }
+
+ /*
+ * If error encountered, give up tracing on this vnode. We defer
+ * all the vrele()'s on the vnode until after we are finished walking
+ * the various lists to avoid needlessly holding locks.
+ * NB: at this point we still hold the vnode reference that must
+ * not go away as we need the valid vnode to compare with. Thus let
+ * vrele_count start at 1 and the reference will be freed
+ * by the loop at the end after our last use of vp.
+ */
+ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
+ error);
+ vrele_count = 1;
+ /*
+ * First, clear this vnode from being used by any processes in the
+ * system.
+ * XXX - If one process gets an EPERM writing to the vnode, should
+ * we really do this? Other processes might have suitable
+ * credentials for the operation.
+ */
+ cred = NULL;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ }
+ PROC_UNLOCK(p);
+ if (cred != NULL) {
+ crfree(cred);
+ cred = NULL;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+
+ while (vrele_count-- > 0)
+ vrele(vp);
+}
+
+/*
+ * Return true if caller has permission to set the ktracing state
+ * of target. Essentially, the target can't possess any
+ * more permissions than the caller. KTRFAC_ROOT signifies that
+ * root previously set the tracing status on the target process, and
+ * so, only root may further change it.
+ */
+static int
+ktrcanset(struct thread *td, struct proc *targetp)
+{
+
+ PROC_LOCK_ASSERT(targetp, MA_OWNED);
+ if (targetp->p_traceflag & KTRFAC_ROOT &&
+ priv_check(td, PRIV_KTRACE))
+ return (0);
+
+ if (p_candebug(td, targetp) != 0)
+ return (0);
+
+ return (1);
+}
+
+#endif /* KTRACE */
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
new file mode 100644
index 0000000..7d32260
--- /dev/null
+++ b/sys/kern/kern_linker.c
@@ -0,0 +1,2162 @@
+/*-
+ * Copyright (c) 1997-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kld.h"
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/linker.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/libkern.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include "linker_if.h"
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KLD_DEBUG
+int kld_debug = 0;
+SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RW | CTLFLAG_TUN,
+ &kld_debug, 0, "Set various levels of KLD debug");
+TUNABLE_INT("debug.kld_debug", &kld_debug);
+#endif
+
+/*
+ * static char *linker_search_path(const char *name, struct mod_depend
+ * *verinfo);
+ */
+static const char *linker_basename(const char *path);
+
+/*
+ * Find a currently loaded file given its filename.
+ */
+static linker_file_t linker_find_file_by_name(const char* _filename);
+
+/*
+ * Find a currently loaded file given its file id.
+ */
+static linker_file_t linker_find_file_by_id(int _fileid);
+
+/* Metadata from the static kernel */
+SET_DECLARE(modmetadata_set, struct mod_metadata);
+
+MALLOC_DEFINE(M_LINKER, "linker", "kernel linker");
+
+linker_file_t linker_kernel_file;
+
+static struct sx kld_sx; /* kernel linker lock */
+
+/*
+ * Load counter used by clients to determine if a linker file has been
+ * re-loaded. This counter is incremented for each file load.
+ */
+static int loadcnt;
+
+static linker_class_list_t classes;
+static linker_file_list_t linker_files;
+static int next_file_id = 1;
+static int linker_no_more_classes = 0;
+
+#define LINKER_GET_NEXT_FILE_ID(a) do { \
+ linker_file_t lftmp; \
+ \
+ if (!cold) \
+ sx_assert(&kld_sx, SA_XLOCKED); \
+retry: \
+ TAILQ_FOREACH(lftmp, &linker_files, link) { \
+ if (next_file_id == lftmp->id) { \
+ next_file_id++; \
+ goto retry; \
+ } \
+ } \
+ (a) = next_file_id; \
+} while(0)
+
+
+/* XXX wrong name; we're looking at version provision tags here, not modules */
+typedef TAILQ_HEAD(, modlist) modlisthead_t;
+struct modlist {
+ TAILQ_ENTRY(modlist) link; /* chain together all modules */
+ linker_file_t container;
+ const char *name;
+ int version;
+};
+typedef struct modlist *modlist_t;
+static modlisthead_t found_modules;
+
+static int linker_file_add_dependency(linker_file_t file,
+ linker_file_t dep);
+static caddr_t linker_file_lookup_symbol_internal(linker_file_t file,
+ const char* name, int deps);
+static int linker_load_module(const char *kldname,
+ const char *modname, struct linker_file *parent,
+ struct mod_depend *verinfo, struct linker_file **lfpp);
+static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
+
+static void
+linker_init(void *arg)
+{
+
+ sx_init(&kld_sx, "kernel linker");
+ TAILQ_INIT(&classes);
+ TAILQ_INIT(&linker_files);
+}
+
+SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0);
+
+static void
+linker_stop_class_add(void *arg)
+{
+
+ linker_no_more_classes = 1;
+}
+
+SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL);
+
+int
+linker_add_class(linker_class_t lc)
+{
+
+ /*
+ * We disallow any class registration past SI_ORDER_ANY
+ * of SI_SUB_KLD. We bump the reference count to keep the
+ * ops from being freed.
+ */
+ if (linker_no_more_classes == 1)
+ return (EPERM);
+ kobj_class_compile((kobj_class_t) lc);
+ ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */
+ TAILQ_INSERT_TAIL(&classes, lc, link);
+ return (0);
+}
+
+static void
+linker_file_sysinit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0)
+ return;
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ sx_xunlock(&kld_sx);
+ mtx_lock(&Giant);
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+ mtx_unlock(&Giant);
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_sysuninit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop,
+ NULL) != 0)
+ return;
+
+ /*
+ * Perform a reverse bubble sort of the system initialization objects
+ * by their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem > (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order >= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ sx_xunlock(&kld_sx);
+ mtx_lock(&Giant);
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+ mtx_unlock(&Giant);
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_register_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+ KLD_DPF(FILE,
+ ("linker_file_register_sysctls: registering SYSCTLs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ sx_xunlock(&kld_sx);
+ sysctl_lock();
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_register_oid(*oidp);
+ sysctl_unlock();
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_unregister_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+ KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs"
+ " for %s\n", lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ sx_xunlock(&kld_sx);
+ sysctl_lock();
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_unregister_oid(*oidp);
+ sysctl_unlock();
+ sx_xlock(&kld_sx);
+}
+
+static int
+linker_file_register_modules(linker_file_t lf)
+{
+ struct mod_metadata **start, **stop, **mdp;
+ const moduledata_t *moddata;
+ int first_error, error;
+
+ KLD_DPF(FILE, ("linker_file_register_modules: registering modules"
+ " in %s\n", lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "modmetadata_set", &start,
+ &stop, NULL) != 0) {
+ /*
+ * This fallback should be unnecessary, but if we get booted
+ * from boot2 instead of loader and we are missing our
+ * metadata then we have to try the best we can.
+ */
+ if (lf == linker_kernel_file) {
+ start = SET_BEGIN(modmetadata_set);
+ stop = SET_LIMIT(modmetadata_set);
+ } else
+ return (0);
+ }
+ first_error = 0;
+ for (mdp = start; mdp < stop; mdp++) {
+ if ((*mdp)->md_type != MDT_MODULE)
+ continue;
+ moddata = (*mdp)->md_data;
+ KLD_DPF(FILE, ("Registering module %s in %s\n",
+ moddata->name, lf->filename));
+ error = module_register(moddata, lf);
+ if (error) {
+ printf("Module %s failed to register: %d\n",
+ moddata->name, error);
+ if (first_error == 0)
+ first_error = error;
+ }
+ }
+ return (first_error);
+}
+
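linker_file_register_modules() above walks a file's modmetadata_set and registers every MDT_MODULE entry it finds. Those entries are normally produced by the DECLARE_MODULE() macro in the module's own source; the hedged skeleton below, patterned after the customary KLD example, shows what such a module provides (all names are illustrative).

/*
 * Sketch only: a minimal loadable module whose metadata lands in
 * modmetadata_set and is registered by the code above.
 */
#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/systm.h>

static int
example_modevent(module_t mod, int event, void *arg)
{

	switch (event) {
	case MOD_LOAD:
		printf("example module loaded\n");
		return (0);
	case MOD_UNLOAD:
		printf("example module unloaded\n");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = {
	"example",		/* module name */
	example_modevent,	/* event handler */
	NULL			/* extra data */
};

DECLARE_MODULE(example, example_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_VERSION(example, 1);
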
+static void
+linker_init_kernel_modules(void)
+{
+
+ sx_xlock(&kld_sx);
+ linker_file_register_modules(linker_kernel_file);
+ sx_xunlock(&kld_sx);
+}
+
+SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules,
+ 0);
+
+static int
+linker_load_file(const char *filename, linker_file_t *result)
+{
+ linker_class_t lc;
+ linker_file_t lf;
+ int foundfile, error, modules;
+
+ /* Refuse to load modules if securelevel raised */
+ if (prison0.pr_securelevel > 0)
+ return (EPERM);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ lf = linker_find_file_by_name(filename);
+ if (lf) {
+ KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
+ " incrementing refs\n", filename));
+ *result = lf;
+ lf->refs++;
+ return (0);
+ }
+ foundfile = 0;
+ error = 0;
+
+ /*
+ * We do not need to protect (lock) classes here because there is
+ * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY)
+ * and there is no class deregistration mechanism at this time.
+ */
+ TAILQ_FOREACH(lc, &classes, link) {
+ KLD_DPF(FILE, ("linker_load_file: trying to load %s\n",
+ filename));
+ error = LINKER_LOAD_FILE(lc, filename, &lf);
+ /*
+ * If we got something other than ENOENT, then it exists but
+ * we cannot load it for some other reason.
+ */
+ if (error != ENOENT)
+ foundfile = 1;
+ if (lf) {
+ error = linker_file_register_modules(lf);
+ if (error == EEXIST) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+ }
+ modules = !TAILQ_EMPTY(&lf->modules);
+ linker_file_register_sysctls(lf);
+ linker_file_sysinit(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+
+ /*
+ * If all of the modules in this file failed
+ * to load, unload the file and return an
+ * error of ENOEXEC.
+ */
+ if (modules && TAILQ_EMPTY(&lf->modules)) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (ENOEXEC);
+ }
+ *result = lf;
+ return (0);
+ }
+ }
+ /*
+ * Less than ideal, but tells the user whether it failed to load or
+ * the module was not found.
+ */
+ if (foundfile) {
+
+ /*
+ * If the last linker class tried did not recognize the file type,
+ * print a message before failing.
+ */
+ if (error == ENOSYS)
+ printf("linker_load_file: Unsupported file type\n");
+
+ /*
+ * Format not recognized or otherwise unloadable.
+ * When loading a module that is statically built into
+ * the kernel EEXIST percolates back up as the return
+ * value. Preserve this so that apps like sysinstall
+ * can recognize this special case and not post bogus
+ * dialog boxes.
+ */
+ if (error != EEXIST)
+ error = ENOEXEC;
+ } else
+ error = ENOENT; /* Nothing found */
+ return (error);
+}
+
+int
+linker_reference_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t *result)
+{
+ modlist_t mod;
+ int error;
+
+ sx_xlock(&kld_sx);
+ if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
+ *result = mod->container;
+ (*result)->refs++;
+ sx_xunlock(&kld_sx);
+ return (0);
+ }
+
+ error = linker_load_module(NULL, modname, NULL, verinfo, result);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+linker_release_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t lf)
+{
+ modlist_t mod;
+ int error;
+
+ sx_xlock(&kld_sx);
+ if (lf == NULL) {
+ KASSERT(modname != NULL,
+ ("linker_release_module: no file or name"));
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod == NULL) {
+ sx_xunlock(&kld_sx);
+ return (ESRCH);
+ }
+ lf = mod->container;
+ } else
+ KASSERT(modname == NULL && verinfo == NULL,
+ ("linker_release_module: both file and name"));
+ error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+static linker_file_t
+linker_find_file_by_name(const char *filename)
+{
+ linker_file_t lf;
+ char *koname;
+
+ koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
+ sprintf(koname, "%s.ko", filename);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (strcmp(lf->filename, koname) == 0)
+ break;
+ if (strcmp(lf->filename, filename) == 0)
+ break;
+ }
+ free(koname, M_LINKER);
+ return (lf);
+}
+
+static linker_file_t
+linker_find_file_by_id(int fileid)
+{
+ linker_file_t lf;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ TAILQ_FOREACH(lf, &linker_files, link)
+ if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
+ break;
+ return (lf);
+}
+
+int
+linker_file_foreach(linker_predicate_t *predicate, void *context)
+{
+ linker_file_t lf;
+ int retval = 0;
+
+ sx_xlock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ retval = predicate(lf, context);
+ if (retval != 0)
+ break;
+ }
+ sx_xunlock(&kld_sx);
+ return (retval);
+}
+
+linker_file_t
+linker_make_file(const char *pathname, linker_class_t lc)
+{
+ linker_file_t lf;
+ const char *filename;
+
+ if (!cold)
+ sx_assert(&kld_sx, SA_XLOCKED);
+ filename = linker_basename(pathname);
+
+ KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname));
+ lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
+ if (lf == NULL)
+ return (NULL);
+ lf->refs = 1;
+ lf->userrefs = 0;
+ lf->flags = 0;
+ lf->filename = strdup(filename, M_LINKER);
+ lf->pathname = strdup(pathname, M_LINKER);
+ LINKER_GET_NEXT_FILE_ID(lf->id);
+ lf->ndeps = 0;
+ lf->deps = NULL;
+ lf->loadcnt = ++loadcnt;
+ STAILQ_INIT(&lf->common);
+ TAILQ_INIT(&lf->modules);
+ TAILQ_INSERT_TAIL(&linker_files, lf, link);
+ return (lf);
+}
+
+int
+linker_file_unload(linker_file_t file, int flags)
+{
+ module_t mod, next;
+ modlist_t ml, nextml;
+ struct common_symbol *cp;
+ int error, i;
+
+ /* Refuse to unload modules if securelevel raised. */
+ if (prison0.pr_securelevel > 0)
+ return (EPERM);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
+
+ /* Easy case of just dropping a reference. */
+ if (file->refs > 1) {
+ file->refs--;
+ return (0);
+ }
+
+ KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
+ " informing modules\n"));
+
+ /*
+ * Quiesce all the modules to give them a chance to veto the unload.
+ */
+ MOD_SLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod;
+ mod = module_getfnext(mod)) {
+
+ error = module_quiesce(mod);
+ if (error != 0 && flags != LINKER_UNLOAD_FORCE) {
+ KLD_DPF(FILE, ("linker_file_unload: module %s"
+ " vetoed unload\n", module_getname(mod)));
+ /*
+ * XXX: Do we need to tell all the quiesced modules
+ * that they can resume work now via a new module
+ * event?
+ */
+ MOD_SUNLOCK;
+ return (error);
+ }
+ }
+ MOD_SUNLOCK;
+
+ /*
+ * Inform any modules associated with this file that they are
+ * being unloaded.
+ */
+ MOD_XLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+ next = module_getfnext(mod);
+ MOD_XUNLOCK;
+
+ /*
+ * Give the module a chance to veto the unload.
+ */
+ if ((error = module_unload(mod)) != 0) {
+#ifdef KLD_DEBUG
+ MOD_SLOCK;
+ KLD_DPF(FILE, ("linker_file_unload: module %s"
+ " failed unload\n", module_getname(mod)));
+ MOD_SUNLOCK;
+#endif
+ return (error);
+ }
+ MOD_XLOCK;
+ module_release(mod);
+ }
+ MOD_XUNLOCK;
+
+ TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
+ if (ml->container == file) {
+ TAILQ_REMOVE(&found_modules, ml, link);
+ free(ml, M_LINKER);
+ }
+ }
+
+ /*
+ * Don't try to run SYSUNINITs if we are unloaded due to a
+ * link error.
+ */
+ if (file->flags & LINKER_FILE_LINKED) {
+ file->flags &= ~LINKER_FILE_LINKED;
+ linker_file_sysuninit(file);
+ linker_file_unregister_sysctls(file);
+ }
+ TAILQ_REMOVE(&linker_files, file, link);
+
+ if (file->deps) {
+ for (i = 0; i < file->ndeps; i++)
+ linker_file_unload(file->deps[i], flags);
+ free(file->deps, M_LINKER);
+ file->deps = NULL;
+ }
+ while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
+ STAILQ_REMOVE_HEAD(&file->common, link);
+ free(cp, M_LINKER);
+ }
+
+ LINKER_UNLOAD(file);
+ if (file->filename) {
+ free(file->filename, M_LINKER);
+ file->filename = NULL;
+ }
+ if (file->pathname) {
+ free(file->pathname, M_LINKER);
+ file->pathname = NULL;
+ }
+ kobj_delete((kobj_t) file, M_LINKER);
+ return (0);
+}
+
+int
+linker_ctf_get(linker_file_t file, linker_ctf_t *lc)
+{
+ return (LINKER_CTF_GET(file, lc));
+}
+
+static int
+linker_file_add_dependency(linker_file_t file, linker_file_t dep)
+{
+ linker_file_t *newdeps;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (newdeps == NULL)
+ return (ENOMEM);
+
+ if (file->deps) {
+ bcopy(file->deps, newdeps,
+ file->ndeps * sizeof(linker_file_t *));
+ free(file->deps, M_LINKER);
+ }
+ file->deps = newdeps;
+ file->deps[file->ndeps] = dep;
+ file->ndeps++;
+ KLD_DPF(FILE, ("linker_file_add_dependency:"
+ " adding %s as dependency for %s\n",
+ dep->filename, file->filename));
+ return (0);
+}
+
+/*
+ * Locate a linker set and its contents. This is a helper function to avoid
+ * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **.
+ * This function is used in this file so we can avoid having lots of (void **)
+ * casts.
+ */
+int
+linker_file_lookup_set(linker_file_t file, const char *name,
+ void *firstp, void *lastp, int *countp)
+{
+
+ sx_assert(&kld_sx, SA_LOCKED);
+ return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp));
+}
+
+/*
+ * List all functions in a file.
+ */
+int
+linker_file_function_listall(linker_file_t lf,
+ linker_function_nameval_callback_t callback_func, void *arg)
+{
+ return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg));
+}
+
+caddr_t
+linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
+{
+ caddr_t sym;
+ int locked;
+
+ locked = sx_xlocked(&kld_sx);
+ if (!locked)
+ sx_xlock(&kld_sx);
+ sym = linker_file_lookup_symbol_internal(file, name, deps);
+ if (!locked)
+ sx_xunlock(&kld_sx);
+ return (sym);
+}
+
+static caddr_t
+linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
+ int deps)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ caddr_t address;
+ size_t common_size = 0;
+ int i;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
+ file, name, deps));
+
+ if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) {
+ LINKER_SYMBOL_VALUES(file, sym, &symval);
+ if (symval.value == 0)
+ /*
+ * For commons, first look them up in the
+ * dependencies and only allocate space if not found
+ * there.
+ */
+ common_size = symval.size;
+ else {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol"
+ ".value=%p\n", symval.value));
+ return (symval.value);
+ }
+ }
+ if (deps) {
+ for (i = 0; i < file->ndeps; i++) {
+ address = linker_file_lookup_symbol_internal(
+ file->deps[i], name, 0);
+ if (address) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " deps value=%p\n", address));
+ return (address);
+ }
+ }
+ }
+ if (common_size > 0) {
+ /*
+ * This is a common symbol which was not found in the
+ * dependencies. We maintain a simple common symbol table in
+ * the file object.
+ */
+ struct common_symbol *cp;
+
+ STAILQ_FOREACH(cp, &file->common, link) {
+ if (strcmp(cp->name, name) == 0) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " old common value=%p\n", cp->address));
+ return (cp->address);
+ }
+ }
+ /*
+ * Round the symbol size up to align.
+ */
+ common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
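+ /*
+ * A single allocation holds the bookkeeping structure, the
+ * zeroed common storage and the symbol name, laid out as
+ * [struct common_symbol][common storage][name].
+ */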
+ cp = malloc(sizeof(struct common_symbol)
+ + common_size + strlen(name) + 1, M_LINKER,
+ M_WAITOK | M_ZERO);
+ cp->address = (caddr_t)(cp + 1);
+ cp->name = cp->address + common_size;
+ strcpy(cp->name, name);
+ bzero(cp->address, common_size);
+ STAILQ_INSERT_TAIL(&file->common, cp, link);
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: new common"
+ " value=%p\n", cp->address));
+ return (cp->address);
+ }
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
+ return (0);
+}
+
+/*
+ * Both DDB and stack(9) rely on the kernel linker to provide forward and
+ * backward lookup of symbols. However, DDB and sometimes stack(9) need to
+ * do this in a lockfree manner. We provide a set of internal helper
+ * routines to perform these operations without locks, and then wrappers that
+ * optionally lock.
+ *
+ * linker_debug_lookup() is under #ifdef DDB as it is currently only used
+ * by DDB.
+ */
+#ifdef DDB
+static int
+linker_debug_lookup(const char *symstr, c_linker_sym_t *sym)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+#endif
+
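+/*
+ * Walk every loaded file and return the symbol whose address most closely
+ * precedes "value", together with the offset of "value" from that symbol.
+ */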
+static int
+linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
+{
+ linker_file_t lf;
+ c_linker_sym_t best, es;
+ u_long diff, bestdiff, off;
+
+ best = 0;
+ off = (uintptr_t)value;
+ bestdiff = off;
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0)
+ continue;
+ if (es != 0 && diff < bestdiff) {
+ best = es;
+ bestdiff = diff;
+ }
+ if (bestdiff == 0)
+ break;
+ }
+ if (best) {
+ *sym = best;
+ *diffp = bestdiff;
+ return (0);
+ } else {
+ *sym = 0;
+ *diffp = off;
+ return (ENOENT);
+ }
+}
+
+static int
+linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+
+static int
+linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+ linker_symval_t symval;
+ c_linker_sym_t sym;
+ int error;
+
+ *offset = 0;
+ error = linker_debug_search_symbol(value, &sym, offset);
+ if (error)
+ return (error);
+ error = linker_debug_symbol_values(sym, &symval);
+ if (error)
+ return (error);
+ strlcpy(buf, symval.name, buflen);
+ return (0);
+}
+
+/*
+ * DDB Helpers. DDB has to look across multiple files with their own symbol
+ * tables and string tables.
+ *
+ * Note that we do not obey list locking protocols here. We really don't need
+ * DDB to hang because somebody's got the lock held. We'll take the chance
+ * that the files list is inconsistent instead.
+ */
+#ifdef DDB
+int
+linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym)
+{
+
+ return (linker_debug_lookup(symstr, sym));
+}
+#endif
+
+int
+linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
+{
+
+ return (linker_debug_search_symbol(value, sym, diffp));
+}
+
+int
+linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
+{
+
+ return (linker_debug_symbol_values(sym, symval));
+}
+
+int
+linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+
+ return (linker_debug_search_symbol_name(value, buf, buflen, offset));
+}
+
+/*
+ * stack(9) helper for non-debugging environments. Unlike DDB helpers, we do
+ * obey locking protocols, and offer a significantly less complex interface.
+ */
+int
+linker_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+ int error;
+
+ sx_xlock(&kld_sx);
+ error = linker_debug_search_symbol_name(value, buf, buflen, offset);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+/*
+ * Syscalls.
+ */
+int
+kern_kldload(struct thread *td, const char *file, int *fileid)
+{
+ const char *kldname, *modname;
+ linker_file_t lf;
+ int error;
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
+
+ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
+ return (error);
+
+ /*
+ * It is possible that a kldloaded module will attach a new ifnet,
+ * so the vnet context must be set when this occurs.
+ */
+ CURVNET_SET(TD_TO_VNET(td));
+
+ /*
+ * If the file name is not qualified (it contains neither a path
+ * separator nor a dot, as in kldname.ko or kldname.ver.ko),
+ * treat it as an interface (module) name.
+ */
+ if (strchr(file, '/') || strchr(file, '.')) {
+ kldname = file;
+ modname = NULL;
+ } else {
+ kldname = NULL;
+ modname = file;
+ }
+
+ sx_xlock(&kld_sx);
+ error = linker_load_module(kldname, modname, NULL, NULL, &lf);
+ if (error) {
+ sx_xunlock(&kld_sx);
+ goto done;
+ }
+ lf->userrefs++;
+ if (fileid != NULL)
+ *fileid = lf->id;
+
+ sx_downgrade(&kld_sx);
+ EVENTHANDLER_INVOKE(kld_load, lf);
+ sx_sunlock(&kld_sx);
+
+done:
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sys_kldload(struct thread *td, struct kldload_args *uap)
+{
+ char *pathname = NULL;
+ int error, fileid;
+
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
+ if (error == 0) {
+ error = kern_kldload(td, pathname, &fileid);
+ if (error == 0)
+ td->td_retval[0] = fileid;
+ }
+ free(pathname, M_TEMP);
+ return (error);
+}
+
+int
+kern_kldunload(struct thread *td, int fileid, int flags)
+{
+ linker_file_t lf;
+ char *filename = NULL;
+ caddr_t address;
+ size_t size;
+ int error = 0;
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
+
+ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
+ return (error);
+
+ CURVNET_SET(TD_TO_VNET(td));
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(fileid);
+ if (lf) {
+ KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
+
+ EVENTHANDLER_INVOKE(kld_unload_try, lf, &error);
+ if (error != 0)
+ error = EBUSY;
+ else if (lf->userrefs == 0) {
+ /*
+ * XXX: maybe LINKER_UNLOAD_FORCE should override?
+ */
+ printf("kldunload: attempt to unload file that was"
+ " loaded by the kernel\n");
+ error = EBUSY;
+ } else {
+ /* Save data needed for the kld_unload callbacks. */
+ filename = strdup(lf->filename, M_TEMP);
+ address = lf->address;
+ size = lf->size;
+
+ lf->userrefs--;
+ error = linker_file_unload(lf, flags);
+ if (error)
+ lf->userrefs++;
+ }
+ } else
+ error = ENOENT;
+
+ if (error == 0) {
+ sx_downgrade(&kld_sx);
+ EVENTHANDLER_INVOKE(kld_unload, filename, address, size);
+ sx_sunlock(&kld_sx);
+ } else
+ sx_xunlock(&kld_sx);
+ free(filename, M_TEMP);
+
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sys_kldunload(struct thread *td, struct kldunload_args *uap)
+{
+
+ return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
+}
+
+int
+sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap)
+{
+
+ if (uap->flags != LINKER_UNLOAD_NORMAL &&
+ uap->flags != LINKER_UNLOAD_FORCE)
+ return (EINVAL);
+ return (kern_kldunload(td, uap->fileid, uap->flags));
+}
+
+int
+sys_kldfind(struct thread *td, struct kldfind_args *uap)
+{
+ char *pathname;
+ const char *filename;
+ linker_file_t lf;
+ int error;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
+ goto out;
+
+ filename = linker_basename(pathname);
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_name(filename);
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ error = ENOENT;
+ sx_xunlock(&kld_sx);
+out:
+ free(pathname, M_TEMP);
+ return (error);
+}
+
+int
+sys_kldnext(struct thread *td, struct kldnext_args *uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ if (uap->fileid == 0)
+ lf = TAILQ_FIRST(&linker_files);
+ else {
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ lf = TAILQ_NEXT(lf, link);
+ }
+
+ /* Skip partially loaded files. */
+ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
+ lf = TAILQ_NEXT(lf, link);
+
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ td->td_retval[0] = 0;
+out:
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+sys_kldstat(struct thread *td, struct kldstat_args *uap)
+{
+ struct kld_file_stat stat;
+ int error, version;
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
+ != 0)
+ return (error);
+ if (version != sizeof(struct kld_file_stat_1) &&
+ version != sizeof(struct kld_file_stat))
+ return (EINVAL);
+
+ error = kern_kldstat(td, uap->fileid, &stat);
+ if (error != 0)
+ return (error);
+ return (copyout(&stat, uap->stat, version));
+}
+
+int
+kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
+{
+ linker_file_t lf;
+ int namelen;
+#ifdef MAC
+ int error;
+
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(fileid);
+ if (lf == NULL) {
+ sx_xunlock(&kld_sx);
+ return (ENOENT);
+ }
+
+ /* Version 1 fields: */
+ namelen = strlen(lf->filename) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ bcopy(lf->filename, &stat->name[0], namelen);
+ stat->refs = lf->refs;
+ stat->id = lf->id;
+ stat->address = lf->address;
+ stat->size = lf->size;
+ /* Version 2 fields: */
+ namelen = strlen(lf->pathname) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ bcopy(lf->pathname, &stat->pathname[0], namelen);
+ sx_xunlock(&kld_sx);
+
+ td->td_retval[0] = 0;
+ return (0);
+}
+
+int
+sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
+{
+ linker_file_t lf;
+ module_t mp;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf) {
+ MOD_SLOCK;
+ mp = TAILQ_FIRST(&lf->modules);
+ if (mp != NULL)
+ td->td_retval[0] = module_getid(mp);
+ else
+ td->td_retval[0] = 0;
+ MOD_SUNLOCK;
+ } else
+ error = ENOENT;
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+sys_kldsym(struct thread *td, struct kldsym_args *uap)
+{
+ char *symstr = NULL;
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ linker_file_t lf;
+ struct kld_sym_lookup lookup;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
+ return (error);
+ if (lookup.version != sizeof(lookup) ||
+ uap->cmd != KLDSYM_LOOKUP)
+ return (EINVAL);
+ symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ sx_xlock(&kld_sx);
+ if (uap->fileid != 0) {
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf == NULL)
+ error = ENOENT;
+ else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t) symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, uap->data, sizeof(lookup));
+ } else
+ error = ENOENT;
+ } else {
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t)symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, uap->data,
+ sizeof(lookup));
+ break;
+ }
+ }
+ if (lf == NULL)
+ error = ENOENT;
+ }
+ sx_xunlock(&kld_sx);
+out:
+ free(symstr, M_TEMP);
+ return (error);
+}
+
+/*
+ * Preloaded module support
+ */
+
+static modlist_t
+modlist_lookup(const char *name, int ver)
+{
+ modlist_t mod;
+
+ TAILQ_FOREACH(mod, &found_modules, link) {
+ if (strcmp(mod->name, name) == 0 &&
+ (ver == 0 || mod->version == ver))
+ return (mod);
+ }
+ return (NULL);
+}
+
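+/*
+ * As modlist_lookup(), but select by version constraints: an exact match on
+ * the preferred version wins outright, otherwise the highest version within
+ * [md_ver_minimum, md_ver_maximum] is returned.
+ */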
+static modlist_t
+modlist_lookup2(const char *name, struct mod_depend *verinfo)
+{
+ modlist_t mod, bestmod;
+ int ver;
+
+ if (verinfo == NULL)
+ return (modlist_lookup(name, 0));
+ bestmod = NULL;
+ TAILQ_FOREACH(mod, &found_modules, link) {
+ if (strcmp(mod->name, name) != 0)
+ continue;
+ ver = mod->version;
+ if (ver == verinfo->md_ver_preferred)
+ return (mod);
+ if (ver >= verinfo->md_ver_minimum &&
+ ver <= verinfo->md_ver_maximum &&
+ (bestmod == NULL || ver > bestmod->version))
+ bestmod = mod;
+ }
+ return (bestmod);
+}
+
+static modlist_t
+modlist_newmodule(const char *modname, int version, linker_file_t container)
+{
+ modlist_t mod;
+
+ mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO);
+ if (mod == NULL)
+ panic("no memory for module list");
+ mod->container = container;
+ mod->name = modname;
+ mod->version = version;
+ TAILQ_INSERT_TAIL(&found_modules, mod, link);
+ return (mod);
+}
+
+static void
+linker_addmodules(linker_file_t lf, struct mod_metadata **start,
+ struct mod_metadata **stop, int preload)
+{
+ struct mod_metadata *mp, **mdp;
+ const char *modname;
+ int ver;
+
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ ver = ((struct mod_version *)mp->md_data)->mv_version;
+ if (modlist_lookup(modname, ver) != NULL) {
+ printf("module %s already present!\n", modname);
+ /* XXX what can we do? this is a build error. :-( */
+ continue;
+ }
+ modlist_newmodule(modname, ver, lf);
+ }
+}
+
+static void
+linker_preload(void *arg)
+{
+ caddr_t modptr;
+ const char *modname, *nmodname;
+ char *modtype;
+ linker_file_t lf, nlf;
+ linker_class_t lc;
+ int error;
+ linker_file_list_t loaded_files;
+ linker_file_list_t depended_files;
+ struct mod_metadata *mp, *nmp;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_depend *verinfo;
+ int nver;
+ int resolves;
+ modlist_t mod;
+ struct sysinit **si_start, **si_stop;
+
+ TAILQ_INIT(&loaded_files);
+ TAILQ_INIT(&depended_files);
+ TAILQ_INIT(&found_modules);
+ error = 0;
+
+ modptr = NULL;
+ sx_xlock(&kld_sx);
+ while ((modptr = preload_search_next_name(modptr)) != NULL) {
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ if (modname == NULL) {
+ printf("Preloaded module at %p does not have a"
+ " name!\n", modptr);
+ continue;
+ }
+ if (modtype == NULL) {
+ printf("Preloaded module at %p does not have a type!\n",
+ modptr);
+ continue;
+ }
+ if (bootverbose)
+ printf("Preloaded %s \"%s\" at %p.\n", modtype, modname,
+ modptr);
+ lf = NULL;
+ TAILQ_FOREACH(lc, &classes, link) {
+ error = LINKER_LINK_PRELOAD(lc, modname, &lf);
+ if (!error)
+ break;
+ lf = NULL;
+ }
+ if (lf)
+ TAILQ_INSERT_TAIL(&loaded_files, lf, loaded);
+ }
+
+ /*
+ * First get a list of stuff in the kernel.
+ */
+ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start,
+ &stop, NULL) == 0)
+ linker_addmodules(linker_kernel_file, start, stop, 1);
+
+ /*
+ * This is a once-off kinky bubble sort to resolve relocation
+ * dependency requirements.
+ */
+restart:
+ TAILQ_FOREACH(lf, &loaded_files, loaded) {
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ /*
+ * First, look to see if we would successfully link with this
+ * stuff.
+ */
+ resolves = 1; /* unless we know otherwise */
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = *nmdp;
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ nmodname = nmp->md_cval;
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop) /* it's a self reference */
+ continue;
+
+ /*
+ * ok, the module isn't here yet, we
+ * are not finished
+ */
+ if (modlist_lookup2(modname, verinfo) == NULL)
+ resolves = 0;
+ }
+ }
+ /*
+ * OK, if we found our modules, we can link. So, "provide"
+ * the modules inside and add the file to the end of the link
+ * order list.
+ */
+ if (resolves) {
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ nver = ((struct mod_version *)
+ mp->md_data)->mv_version;
+ if (modlist_lookup(modname,
+ nver) != NULL) {
+ printf("module %s already"
+ " present!\n", modname);
+ TAILQ_REMOVE(&loaded_files,
+ lf, loaded);
+ linker_file_unload(lf,
+ LINKER_UNLOAD_FORCE);
+ /* we changed tailq next ptr */
+ goto restart;
+ }
+ modlist_newmodule(modname, nver, lf);
+ }
+ }
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ TAILQ_INSERT_TAIL(&depended_files, lf, loaded);
+ /*
+ * Since we provided modules, we need to restart the
+ * sort so that the previous files that depend on us
+ * have a chance. Also, we've busted the tailq next
+ * pointer with the REMOVE.
+ */
+ goto restart;
+ }
+ }
+
+ /*
+ * At this point, we check to see what could not be resolved.
+ */
+ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ printf("KLD file %s is missing dependencies\n", lf->filename);
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ }
+
+ /*
+ * We made it. Finish off the linking in the order we determined.
+ */
+ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf,
+ linker_kernel_file);
+ if (error)
+ panic("cannot add dependency");
+ }
+ lf->userrefs++; /* so we can (try to) kldunload it */
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod == NULL) {
+ printf("KLD file %s - cannot find "
+ "dependency \"%s\"\n",
+ lf->filename, modname);
+ goto fail;
+ }
+ /* Don't count self-dependencies */
+ if (lf == mod->container)
+ continue;
+ mod->container->refs++;
+ error = linker_file_add_dependency(lf,
+ mod->container);
+ if (error)
+ panic("cannot add dependency");
+ }
+ }
+ /*
+ * Now do relocation etc. using the symbol search paths
+ * established by the dependencies.
+ */
+ error = LINKER_LINK_PRELOAD_FINISH(lf);
+ if (error) {
+ printf("KLD file %s - could not finalize loading\n",
+ lf->filename);
+ goto fail;
+ }
+ linker_file_register_modules(lf);
+ if (linker_file_lookup_set(lf, "sysinit_set", &si_start,
+ &si_stop, NULL) == 0)
+ sysinit_add(si_start, si_stop);
+ linker_file_register_sysctls(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+ continue;
+fail:
+ TAILQ_REMOVE(&depended_files, lf, loaded);
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ }
+ sx_xunlock(&kld_sx);
+ /* woohoo! we made it! */
+}
+
+SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0);
+
+/*
+ * Search for a not-loaded module by name.
+ *
+ * Modules may be found in the following locations:
+ *
+ * - preloaded (result is just the module name)
+ * - on disk (result is the full path to the module)
+ *
+ * If the module name is qualified in any way (contains a path, etc.), we
+ * simply return a copy of it.
+ *
+ * The search path can be manipulated via sysctl. Note that we use the ';'
+ * character as a separator to be consistent with the bootloader.
+ */
+
+static char linker_hintfile[] = "linker.hints";
+static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules";
+
+SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
+ sizeof(linker_path), "module load search path");
+
+TUNABLE_STR("module_path", linker_path, sizeof(linker_path));
+
+static char *linker_ext_list[] = {
+ "",
+ ".ko",
+ NULL
+};
+
+/*
+ * Check whether the file actually exists, either with or without an
+ * extension listed in linker_ext_list. (This should probably be made
+ * generic for the rest of the kernel.)
+ */
+static char *
+linker_lookup_file(const char *path, int pathlen, const char *name,
+ int namelen, struct vattr *vap)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ char *result, **cpp, *sep;
+ int error, len, extlen, reclen, flags;
+ enum vtype type;
+
+ extlen = 0;
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ len = strlen(*cpp);
+ if (len > extlen)
+ extlen = len;
+ }
+ extlen++; /* trailing '\0' */
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+
+ reclen = pathlen + strlen(sep) + namelen + extlen + 1;
+ result = malloc(reclen, M_LINKER, M_WAITOK);
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep,
+ namelen, name, *cpp);
+ /*
+ * Attempt to open the file, and return the path if
+ * we succeed and it's a regular file.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ type = nd.ni_vp->v_type;
+ if (vap)
+ VOP_GETATTR(nd.ni_vp, vap, td->td_ucred);
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (type == VREG)
+ return (result);
+ }
+ }
+ free(result, M_LINKER);
+ return (NULL);
+}
+
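+/*
+ * Round "ptr" up to the next int-aligned offset from "base"; used while
+ * parsing the packed records of the linker.hints file below.
+ */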
+#define INT_ALIGN(base, ptr) ptr = \
+ (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1))
+
+/*
+ * Lookup KLD which contains requested module in the "linker.hints" file. If
+ * version specification is available, then try to find the best KLD.
+ * Otherwise just find the latest one.
+ */
+static char *
+linker_hints_lookup(const char *path, int pathlen, const char *modname,
+ int modnamelen, struct mod_depend *verinfo)
+{
+ struct thread *td = curthread; /* XXX */
+ struct ucred *cred = td ? td->td_ucred : NULL;
+ struct nameidata nd;
+ struct vattr vattr, mattr;
+ u_char *hints = NULL;
+ u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
+ int error, ival, bestver, *intp, found, flags, clen, blen;
+ ssize_t reclen;
+
+ result = NULL;
+ bestver = found = 0;
+
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+ reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen +
+ strlen(sep) + 1;
+ pathbuf = malloc(reclen, M_LINKER, M_WAITOK);
+ snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
+ linker_hintfile);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ goto bad;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG)
+ goto bad;
+ best = cp = NULL;
+ error = VOP_GETATTR(nd.ni_vp, &vattr, cred);
+ if (error)
+ goto bad;
+ /*
+ * XXX: we need to limit this number to some reasonable value
+ */
+ if (vattr.va_size > 100 * 1024) {
+ printf("hints file too large %ld\n", (long)vattr.va_size);
+ goto bad;
+ }
+ hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
+ if (hints == NULL)
+ goto bad;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td);
+ if (error)
+ goto bad;
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ nd.ni_vp = NULL;
+ if (reclen != 0) {
+ printf("can't read %zd\n", reclen);
+ goto bad;
+ }
+ intp = (int *)hints;
+ ival = *intp++;
+ if (ival != LINKER_HINTS_VERSION) {
+ printf("hints file version mismatch %d\n", ival);
+ goto bad;
+ }
+ bufend = hints + vattr.va_size;
+ recptr = (u_char *)intp;
+ clen = blen = 0;
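+ /*
+ * Each hints record consists of an int record length (not
+ * counting itself), an int record type and a payload. For
+ * MDT_VERSION records the payload is a counted module name,
+ * an int-aligned version number and a counted KLD file name.
+ */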
+ while (recptr < bufend && !found) {
+ intp = (int *)recptr;
+ reclen = *intp++;
+ ival = *intp++;
+ cp = (char *)intp;
+ switch (ival) {
+ case MDT_VERSION:
+ clen = *cp++;
+ if (clen != modnamelen || bcmp(cp, modname, clen) != 0)
+ break;
+ cp += clen;
+ INT_ALIGN(hints, cp);
+ ival = *(int *)cp;
+ cp += sizeof(int);
+ clen = *cp++;
+ if (verinfo == NULL ||
+ ival == verinfo->md_ver_preferred) {
+ found = 1;
+ break;
+ }
+ if (ival >= verinfo->md_ver_minimum &&
+ ival <= verinfo->md_ver_maximum &&
+ ival > bestver) {
+ bestver = ival;
+ best = cp;
+ blen = clen;
+ }
+ break;
+ default:
+ break;
+ }
+ recptr += reclen + sizeof(int);
+ }
+ /*
+ * Finally, check whether the KLD is actually in place.
+ */
+ if (found)
+ result = linker_lookup_file(path, pathlen, cp, clen, &mattr);
+ else if (best)
+ result = linker_lookup_file(path, pathlen, best, blen, &mattr);
+
+ /*
+ * The KLD is newer than the hints file. What should we do now?
+ */
+ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >))
+ printf("warning: KLD '%s' is newer than the linker.hints"
+ " file\n", result);
+bad:
+ free(pathbuf, M_LINKER);
+ if (hints)
+ free(hints, M_TEMP);
+ if (nd.ni_vp != NULL) {
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ }
+ /*
+ * If nothing was found, or the hints file is absent, fall back
+ * to the old way of using "kldname[.ko]" as the module name.
+ */
+ if (!found && !bestver && result == NULL)
+ result = linker_lookup_file(path, pathlen, modname,
+ modnamelen, NULL);
+ return (result);
+}
+
+/*
+ * Look up the KLD which contains the requested module in all directories.
+ */
+static char *
+linker_search_module(const char *modname, int modnamelen,
+ struct mod_depend *verinfo)
+{
+ char *cp, *ep, *result;
+
+ /*
+ * traverse the linker path
+ */
+ for (cp = linker_path; *cp; cp = ep + 1) {
+ /* find the end of this component */
+ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++);
+ result = linker_hints_lookup(cp, ep - cp, modname,
+ modnamelen, verinfo);
+ if (result != NULL)
+ return (result);
+ if (*ep == 0)
+ break;
+ }
+ return (NULL);
+}
+
+/*
+ * Search for a KLD by name in all directories listed in linker_path.
+ */
+static char *
+linker_search_kld(const char *name)
+{
+ char *cp, *ep, *result;
+ int len;
+
+ /* qualified at all? */
+ if (strchr(name, '/'))
+ return (strdup(name, M_LINKER));
+
+ /* traverse the linker path */
+ len = strlen(name);
+ for (ep = linker_path; *ep; ep++) {
+ cp = ep;
+ /* find the end of this component */
+ for (; *ep != 0 && *ep != ';'; ep++);
+ result = linker_lookup_file(cp, ep - cp, name, len, NULL);
+ if (result != NULL)
+ return (result);
+ }
+ return (NULL);
+}
+
+static const char *
+linker_basename(const char *path)
+{
+ const char *filename;
+
+ filename = strrchr(path, '/');
+ if (filename == NULL)
+ return (path);
+ if (filename[1])
+ filename++;
+ return (filename);
+}
+
+#ifdef HWPMC_HOOKS
+/*
+ * Inform hwpmc about the set of kernel modules currently loaded.
+ */
+void *
+linker_hwpmc_list_objects(void)
+{
+ linker_file_t lf;
+ struct pmckern_map_in *kobase;
+ int i, nmappings;
+
+ nmappings = 0;
+ sx_slock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link)
+ nmappings++;
+
+ /* Allocate nmappings + 1 entries. */
+ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in),
+ M_LINKER, M_WAITOK | M_ZERO);
+ i = 0;
+ TAILQ_FOREACH(lf, &linker_files, link) {
+
+ /* Save the info for this linker file. */
+ kobase[i].pm_file = lf->filename;
+ kobase[i].pm_address = (uintptr_t)lf->address;
+ i++;
+ }
+ sx_sunlock(&kld_sx);
+
+ KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?"));
+
+ /* The last entry of the malloced area consists of all zeros. */
+ KASSERT(kobase[i].pm_file == NULL,
+ ("linker_hwpmc_list_objects: last object not NULL"));
+
+ return ((void *)kobase);
+}
+#endif
+
+/*
+ * Find a file which contains the given module and load it; if "parent" is
+ * not NULL, register a reference to it.
+ */
+static int
+linker_load_module(const char *kldname, const char *modname,
+ struct linker_file *parent, struct mod_depend *verinfo,
+ struct linker_file **lfpp)
+{
+ linker_file_t lfdep;
+ const char *filename;
+ char *pathname;
+ int error;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ if (modname == NULL) {
+ /*
+ * We have to load a KLD by its file name.
+ */
+ KASSERT(verinfo == NULL, ("linker_load_module: verinfo"
+ " is not NULL"));
+ pathname = linker_search_kld(kldname);
+ } else {
+ if (modlist_lookup2(modname, verinfo) != NULL)
+ return (EEXIST);
+ if (kldname != NULL)
+ pathname = strdup(kldname, M_LINKER);
+ else if (rootvnode == NULL)
+ pathname = NULL;
+ else
+ /*
+ * Need to find a KLD containing the required module.
+ */
+ pathname = linker_search_module(modname,
+ strlen(modname), verinfo);
+ }
+ if (pathname == NULL)
+ return (ENOENT);
+
+ /*
+ * Can't load more than one file with the same basename.
+ * XXX: Actually it should be possible to have multiple KLDs
+ * with the same basename but different paths, because they
+ * can provide different versions of the same modules.
+ */
+ filename = linker_basename(pathname);
+ if (linker_find_file_by_name(filename))
+ error = EEXIST;
+ else do {
+ error = linker_load_file(pathname, &lfdep);
+ if (error)
+ break;
+ if (modname && verinfo &&
+ modlist_lookup2(modname, verinfo) == NULL) {
+ linker_file_unload(lfdep, LINKER_UNLOAD_FORCE);
+ error = ENOENT;
+ break;
+ }
+ if (parent) {
+ error = linker_file_add_dependency(parent, lfdep);
+ if (error)
+ break;
+ }
+ if (lfpp)
+ *lfpp = lfdep;
+ } while (0);
+ free(pathname, M_LINKER);
+ return (error);
+}
+
+/*
+ * This routine is responsible for finding the dependencies of
+ * userland-initiated kldload(2) loads of files.
+ */
+int
+linker_load_dependencies(linker_file_t lf)
+{
+ linker_file_t lfdep;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_metadata *mp, *nmp;
+ struct mod_depend *verinfo;
+ modlist_t mod;
+ const char *modname, *nmodname;
+ int ver, error = 0, count;
+
+ /*
+ * All files are dependent on /kernel.
+ */
+ sx_assert(&kld_sx, SA_XLOCKED);
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf, linker_kernel_file);
+ if (error)
+ return (error);
+ }
+ if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop,
+ &count) != 0)
+ return (0);
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ ver = ((struct mod_version *)mp->md_data)->mv_version;
+ mod = modlist_lookup(modname, ver);
+ if (mod != NULL) {
+ printf("interface %s.%d already present in the KLD"
+ " '%s'!\n", modname, ver,
+ mod->container->filename);
+ return (EEXIST);
+ }
+ }
+
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ nmodname = NULL;
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = *nmdp;
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ nmodname = nmp->md_cval;
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop)/* early exit, it's a self reference */
+ continue;
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod) { /* woohoo, it's loaded already */
+ lfdep = mod->container;
+ lfdep->refs++;
+ error = linker_file_add_dependency(lf, lfdep);
+ if (error)
+ break;
+ continue;
+ }
+ error = linker_load_module(NULL, modname, lf, verinfo, NULL);
+ if (error) {
+ printf("KLD %s: depends on %s - not available or"
+ " version mismatch\n", lf->filename, modname);
+ break;
+ }
+ }
+
+ if (error)
+ return (error);
+ linker_addmodules(lf, start, stop, 0);
+ return (error);
+}
+
+static int
+sysctl_kern_function_list_iterate(const char *name, void *opaque)
+{
+ struct sysctl_req *req;
+
+ req = opaque;
+ return (SYSCTL_OUT(req, name, strlen(name) + 1));
+}
+
+/*
+ * Export a nul-separated, double-nul-terminated list of all function names
+ * in the kernel.
+ */
+static int
+sysctl_kern_function_list(SYSCTL_HANDLER_ARGS)
+{
+ linker_file_t lf;
+ int error;
+
+#ifdef MAC
+ error = mac_kld_check_stat(req->td->td_ucred);
+ if (error)
+ return (error);
+#endif
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sx_xlock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ error = LINKER_EACH_FUNCTION_NAME(lf,
+ sysctl_kern_function_list_iterate, req);
+ if (error) {
+ sx_xunlock(&kld_sx);
+ return (error);
+ }
+ }
+ sx_xunlock(&kld_sx);
+ return (SYSCTL_OUT(req, "", 1));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_kern_function_list, "", "kernel function list");
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
new file mode 100644
index 0000000..87dca63
--- /dev/null
+++ b/sys/kern/kern_lock.c
@@ -0,0 +1,1505 @@
+/*-
+ * Copyright (c) 2008 Attilio Rao <attilio@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "opt_adaptive_lockmgrs.h"
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/lock_profile.h>
+#include <sys/lockmgr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sleepqueue.h>
+#ifdef DEBUG_LOCKS
+#include <sys/stack.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
+ (LK_ADAPTIVE | LK_NOSHARE));
+CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
+ ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));
+
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+#ifndef INVARIANTS
+#define _lockmgr_assert(lk, what, file, line)
+#define TD_LOCKS_INC(td)
+#define TD_LOCKS_DEC(td)
+#else
+#define TD_LOCKS_INC(td) ((td)->td_locks++)
+#define TD_LOCKS_DEC(td) ((td)->td_locks--)
+#endif
+#define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++)
+#define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--)
+
+#ifndef DEBUG_LOCKS
+#define STACK_PRINT(lk)
+#define STACK_SAVE(lk)
+#define STACK_ZERO(lk)
+#else
+#define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack)
+#define STACK_SAVE(lk) stack_save(&(lk)->lk_stack)
+#define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack)
+#endif
+
+#define LOCK_LOG2(lk, string, arg1, arg2) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR2(KTR_LOCK, (string), (arg1), (arg2))
+#define LOCK_LOG3(lk, string, arg1, arg2, arg3) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3))
+
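+/*
+ * Giant may be held (possibly recursively) when a lockmgr lock is acquired,
+ * so GIANT_SAVE() releases it before sleeping or spinning, remembering the
+ * recursion count in _i, and GIANT_RESTORE() reacquires it afterwards.
+ */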
+#define GIANT_DECLARE \
+ int _i = 0; \
+ WITNESS_SAVE_DECL(Giant)
+#define GIANT_RESTORE() do { \
+ if (_i > 0) { \
+ while (_i--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _i++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
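+/*
+ * Shared acquisitions are permitted only while the lock is held in shared
+ * mode (or is unlocked), subject to the exclusive waiter/spinner checks
+ * below; a caller that already holds shared locks, or has TDP_DEADLKTREAT
+ * set, may jump ahead of exclusive waiters to avoid deadlock.
+ */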
+#define LK_CAN_SHARE(x) \
+ (((x) & LK_SHARE) && (((x) & LK_EXCLUSIVE_WAITERS) == 0 || \
+ ((x) & LK_EXCLUSIVE_SPINNERS) == 0 || \
+ curthread->td_lk_slocks || (curthread->td_pflags & TDP_DEADLKTREAT)))
+#define LK_TRYOP(x) \
+ ((x) & LK_NOWAIT)
+
+#define LK_CAN_WITNESS(x) \
+ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x))
+#define LK_TRYWIT(x) \
+ (LK_TRYOP(x) ? LOP_TRYLOCK : 0)
+
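+/*
+ * Adaptive spinning is attempted only if the lock was initialized with
+ * LK_ADAPTIVE and the caller did not request LK_SLEEPFAIL.
+ */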
+#define LK_CAN_ADAPT(lk, f) \
+ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \
+ ((f) & LK_SLEEPFAIL) == 0)
+
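+/*
+ * The owner is encoded in lk_lock itself: lockmgr_disowned() checks for the
+ * LK_KERNPROC placeholder owner, while lockmgr_xlocked() checks whether
+ * curthread holds the lock exclusively, in both cases ignoring the waiter
+ * and spinner flag bits.
+ */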
+#define lockmgr_disowned(lk) \
+ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC)
+
+#define lockmgr_xlocked(lk) \
+ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread)
+
+static void assert_lockmgr(const struct lock_object *lock, int how);
+#ifdef DDB
+static void db_show_lockmgr(const struct lock_object *lock);
+#endif
+static void lock_lockmgr(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_lockmgr(const struct lock_object *lock,
+ struct thread **owner);
+#endif
+static int unlock_lockmgr(struct lock_object *lock);
+
+struct lock_class lock_class_lockmgr = {
+ .lc_name = "lockmgr",
+ .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE,
+ .lc_assert = assert_lockmgr,
+#ifdef DDB
+ .lc_ddb_show = db_show_lockmgr,
+#endif
+ .lc_lock = lock_lockmgr,
+ .lc_unlock = unlock_lockmgr,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_lockmgr,
+#endif
+};
+
+#ifdef ADAPTIVE_LOCKMGRS
+static u_int alk_retries = 10;
+static u_int alk_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL,
+ "lockmgr debugging");
+SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, "");
+SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, "");
+#endif
+
+static __inline struct thread *
+lockmgr_xholder(const struct lock *lk)
+{
+ uintptr_t x;
+
+ x = lk->lk_lock;
+ return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x));
+}
+
+/*
+ * This function assumes that the sleepqueue chain lock is held on entry and
+ * returns with it released. It also assumes the generic interlock is sane
+ * and has previously been checked. If LK_INTERLOCK is specified, the
+ * interlock is not reacquired after the sleep.
+ */
+static __inline int
+sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, int queue)
+{
+ GIANT_DECLARE;
+ struct lock_class *class;
+ int catch, error;
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+ catch = pri & PCATCH;
+ pri &= PRIMASK;
+ error = 0;
+
+ LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk,
+ (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared");
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
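+ /*
+ * Track how many sleepers on the exclusive queue requested
+ * LK_SLEEPFAIL; the wakeup paths use this count (as an upper
+ * bound) to detect when every exclusive waiter would fail
+ * immediately and the shared queue should be preferred.
+ */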
+ if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
+ lk->lk_exslpfail++;
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
+ SLEEPQ_INTERRUPTIBLE : 0), queue);
+ if ((flags & LK_TIMELOCK) && timo)
+ sleepq_set_timeout(&lk->lock_object, timo);
+
+ /*
+ * Decide which form of sleep to use based on the requested flags.
+ */
+ if ((flags & LK_TIMELOCK) && timo && catch)
+ error = sleepq_timedwait_sig(&lk->lock_object, pri);
+ else if ((flags & LK_TIMELOCK) && timo)
+ error = sleepq_timedwait(&lk->lock_object, pri);
+ else if (catch)
+ error = sleepq_wait_sig(&lk->lock_object, pri);
+ else
+ sleepq_wait(&lk->lock_object, pri);
+ GIANT_RESTORE();
+ if ((flags & LK_SLEEPFAIL) && error == 0)
+ error = ENOLCK;
+
+ return (error);
+}
+
+static __inline int
+wakeupshlk(struct lock *lk, const char *file, int line)
+{
+ uintptr_t v, x;
+ u_int realexslp;
+ int queue, wakeup_swapper;
+
+ WITNESS_UNLOCK(&lk->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line);
+
+ wakeup_swapper = 0;
+ for (;;) {
+ x = lk->lk_lock;
+
+ /*
+ * If there is more than one shared lock held, just drop one
+ * and return.
+ */
+ if (LK_SHARERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, x,
+ x - LK_ONE_SHARER))
+ break;
+ continue;
+ }
+
+ /*
+ * If there are no waiters on the exclusive queue, drop the
+ * lock quickly.
+ */
+ if ((x & LK_ALL_WAITERS) == 0) {
+ MPASS((x & ~LK_EXCLUSIVE_SPINNERS) ==
+ LK_SHARERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED))
+ break;
+ continue;
+ }
+
+ /*
+ * We should have a sharer with waiters, so enter the hard
+ * path in order to handle wakeups correctly.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ v = LK_UNLOCKED;
+
+ /*
+ * If the lock has exclusive waiters, give them preference in
+ * order to avoid a deadlock with the shared runners-up.
+ * If interruptible sleeps left the exclusive queue empty,
+ * avoid starving the threads sleeping on the shared queue by
+ * giving them precedence and cleaning up the exclusive
+ * waiters bit anyway.
+ * Please note that the lk_exslpfail count may be inaccurate
+ * about the real number of waiters with the LK_SLEEPFAIL flag
+ * set, because LK_SLEEPFAIL may be used in conjunction with
+ * interruptible sleeps, so lk_exslpfail should be considered
+ * an upper bound, including the edge cases.
+ */
+ realexslp = sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL on
+ * and using interruptible sleeps/timeouts may have
+ * left spurious lk_exslpfail counts behind, so clean
+ * them up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x,
+ v)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
+ 0, queue);
+ sleepq_release(&lk->lock_object);
+ break;
+ }
+
+ lock_profile_release_lock(&lk->lock_object);
+ TD_LOCKS_DEC(curthread);
+ TD_SLOCKS_DEC(curthread);
+ return (wakeup_swapper);
+}
+
+static void
+assert_lockmgr(const struct lock_object *lock, int what)
+{
+
+ panic("lockmgr locks do not support assertions");
+}
+
+static void
+lock_lockmgr(struct lock_object *lock, int how)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+static int
+unlock_lockmgr(struct lock_object *lock)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+#ifdef KDTRACE_HOOKS
+static int
+owner_lockmgr(const struct lock_object *lock, struct thread **owner)
+{
+
+ panic("lockmgr locks do not support owner inquiring");
+}
+#endif
+
+void
+lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags)
+{
+ int iflags;
+
+ MPASS((flags & ~LK_INIT_MASK) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock,
+ ("%s: lockmgr not aligned for %s: %p", __func__, wmesg,
+ &lk->lk_lock));
+
+ iflags = LO_SLEEPABLE | LO_UPGRADABLE;
+ if (flags & LK_CANRECURSE)
+ iflags |= LO_RECURSABLE;
+ if ((flags & LK_NODUP) == 0)
+ iflags |= LO_DUPOK;
+ if (flags & LK_NOPROFILE)
+ iflags |= LO_NOPROFILE;
+ if ((flags & LK_NOWITNESS) == 0)
+ iflags |= LO_WITNESS;
+ if (flags & LK_QUIET)
+ iflags |= LO_QUIET;
+ if (flags & LK_IS_VNODE)
+ iflags |= LO_IS_VNODE;
+ iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE);
+
+ lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags);
+ lk->lk_lock = LK_UNLOCKED;
+ lk->lk_recurse = 0;
+ lk->lk_exslpfail = 0;
+ lk->lk_timo = timo;
+ lk->lk_pri = pri;
+ STACK_ZERO(lk);
+}
+
+/*
+ * XXX: Gross hacks to manipulate external lock flags after
+ * initialization. Used for certain vnode and buf locks.
+ */
+void
+lockallowshare(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LK_NOSHARE;
+}
+
+void
+lockallowrecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags |= LO_RECURSABLE;
+}
+
+void
+lockdisablerecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LO_RECURSABLE;
+}
+
+void
+lockdestroy(struct lock *lk)
+{
+
+ KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
+ KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
+ KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters"));
+ lock_destroy(&lk->lock_object);
+}
+
+int
+__lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, const char *file, int line)
+{
+ GIANT_DECLARE;
+ struct lock_class *class;
+ const char *iwmesg;
+ uintptr_t tid, v, x;
+ u_int op, realexslp;
+ int error, ipri, itimo, queue, wakeup_swapper;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+#ifdef ADAPTIVE_LOCKMGRS
+ volatile struct thread *owner;
+ u_int i, spintries = 0;
+#endif
+
+ error = 0;
+ tid = (uintptr_t)curthread;
+ op = (flags & LK_TYPE_MASK);
+ iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg;
+ ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri;
+ itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo;
+
+ MPASS((flags & ~LK_TOTAL_MASK) == 0);
+ KASSERT((op & (op - 1)) == 0,
+ ("%s: Invalid requested operation @ %s:%d", __func__, file, line));
+ KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 ||
+ (op != LK_DOWNGRADE && op != LK_RELEASE),
+ ("%s: Invalid flags in regard of the operation desired @ %s:%d",
+ __func__, file, line));
+ KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL,
+ ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d",
+ __func__, file, line));
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread,
+ lk->lock_object.lo_name, file, line));
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+ if (panicstr != NULL) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ return (0);
+ }
+
+ if (lk->lock_object.lo_flags & LK_NOSHARE) {
+ switch (op) {
+ case LK_SHARED:
+ op = LK_EXCLUSIVE;
+ break;
+ case LK_UPGRADE:
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED,
+ file, line);
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ return (0);
+ }
+ }
+
+ wakeup_swapper = 0;
+ switch (op) {
+ case LK_SHARED:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
+ file, line, flags & LK_INTERLOCK ? ilk : NULL);
+ for (;;) {
+ x = lk->lk_lock;
+
+ /*
+ * If no other thread has an exclusive lock, or
+ * no exclusive waiter is present, bump the count of
+ * sharers. Since we have to preserve the state of
+ * waiters, if we fail to acquire the shared lock
+ * loop back and retry.
+ */
+ if (LK_CAN_SHARE(x)) {
+ if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ x + LK_ONE_SHARER))
+ break;
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is already held by curthread in
+ * exclusive mode, avoid a deadlock.
+ */
+ if (LK_HOLDER(x) == tid) {
+ LOCK_LOG2(lk,
+ "%s: %p already held in exclusive mode",
+ __func__, lk);
+ error = EDEADLK;
+ break;
+ }
+
+ /*
+ * If the operation is not allowed to sleep, just give
+ * up and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes. We need to handle two states here
+ * because, for a failed acquisition, the lock can be
+ * either held in exclusive mode or in shared mode
+ * (for the writer starvation avoidance technique).
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, lk, owner);
+
+ /*
+ * If we are also holding an interlock, drop it
+ * in order to avoid a deadlock if the lockmgr
+ * owner is adaptively spinning on the
+ * interlock itself.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ while (LK_HOLDER(lk->lk_lock) ==
+ (uintptr_t)owner && TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ GIANT_RESTORE();
+ continue;
+ } else if (LK_CAN_ADAPT(lk, flags) &&
+ (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
+ spintries < alk_retries) {
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < alk_loops; i++) {
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, lk, spintries, i);
+ x = lk->lk_lock;
+ if ((x & LK_SHARE) == 0 ||
+ LK_CAN_SHARE(x) != 0)
+ break;
+ cpu_spinwait();
+ }
+ GIANT_RESTORE();
+ if (i != alk_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock can be acquired in shared mode, try
+ * again.
+ */
+ if (LK_CAN_SHARE(x)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owner) while we were waiting on the sleepqueue
+ * chain lock. If so, drop the sleepqueue lock and try
+ * again.
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Try to set the LK_SHARED_WAITERS flag. If we fail,
+ * loop back and retry.
+ */
+ if ((x & LK_SHARED_WAITERS) == 0) {
+ if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ x | LK_SHARED_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set shared waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * shared lock and the shared waiters flag is set,
+ * we will sleep.
+ */
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_SHARED_QUEUE);
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file,
+ line);
+ WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file,
+ line);
+ TD_LOCKS_INC(curthread);
+ TD_SLOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ case LK_UPGRADE:
+ _lockmgr_assert(lk, KA_SLOCKED, file, line);
+ v = lk->lk_lock;
+ x = v & LK_ALL_WAITERS;
+ v &= LK_EXCLUSIVE_SPINNERS;
+
+ /*
+ * Try to switch from one shared lock to an exclusive one.
+ * We need to preserve waiters flags during the operation.
+ */
+ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v,
+ tid | x)) {
+ LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file,
+ line);
+ WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_SLOCKS_DEC(curthread);
+ break;
+ }
+
+ /*
+ * We have been unable to upgrade, so just
+ * give up the shared lock.
+ */
+ wakeup_swapper |= wakeupshlk(lk, file, line);
+
+ /* FALLTHROUGH */
+ case LK_EXCLUSIVE:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * If curthread already holds the lock and this one is
+ * allowed to recurse, simply recurse on it.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if ((flags & LK_CANRECURSE) == 0 &&
+ (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) {
+
+ /*
+ * If this is a try operation, just give up
+ * and return rather than panic.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk,
+ "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+ lk->lk_recurse++;
+ LOCK_LOG2(lk, "%s: %p recursing", __func__, lk);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ break;
+ }
+
+ while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED,
+ tid)) {
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the operation is not allowed to sleep, just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ x = lk->lk_lock;
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, lk, owner);
+
+ /*
+ * If we are also holding an interlock, drop it
+ * in order to avoid a deadlock if the lockmgr
+ * owner is adaptively spinning on the
+ * interlock itself.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ while (LK_HOLDER(lk->lk_lock) ==
+ (uintptr_t)owner && TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ GIANT_RESTORE();
+ continue;
+ } else if (LK_CAN_ADAPT(lk, flags) &&
+ (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
+ spintries < alk_retries) {
+ if ((x & LK_EXCLUSIVE_SPINNERS) == 0 &&
+ !atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_SPINNERS))
+ continue;
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < alk_loops; i++) {
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, lk, spintries, i);
+ if ((lk->lk_lock &
+ LK_EXCLUSIVE_SPINNERS) == 0)
+ break;
+ cpu_spinwait();
+ }
+ GIANT_RESTORE();
+ if (i != alk_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock has been released while we spun on
+ * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owner) while we were waiting on the sleepqueue
+ * chain lock.  If so, drop the sleepqueue lock and try
+ * again.
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * The lock can be in a state where there is a
+ * pending queue of waiters but still no owner.
+ * This happens when the lock is contested and an
+ * owner is going to claim the lock.
+ * If curthread is the one successfully acquiring it,
+ * claim lock ownership and return, preserving the
+ * waiters flags.
+ */
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v &= ~LK_EXCLUSIVE_SPINNERS;
+ if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ tid | v)) {
+ sleepq_release(&lk->lock_object);
+ LOCK_LOG2(lk,
+ "%s: %p claimed by a new writer",
+ __func__, lk);
+ break;
+ }
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set excl waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_EXCLUSIVE_QUEUE);
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+ LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line);
+ WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+ TD_SLOCKS_INC(curthread);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_SHARERS_LOCK(1) | x))
+ break;
+ cpu_spinwait();
+ }
+ break;
+ case LK_RELEASE:
+ _lockmgr_assert(lk, KA_LOCKED, file, line);
+ x = lk->lk_lock;
+
+ if ((x & LK_SHARE) == 0) {
+
+ /*
+ * As a first option, treat the lock as if it has no
+ * waiters.
+ * Fix up the tid variable if the lock has been disowned.
+ */
+ if (LK_HOLDER(x) == LK_KERNPROC)
+ tid = LK_KERNPROC;
+ else {
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE,
+ file, line);
+ TD_LOCKS_DEC(curthread);
+ }
+ LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+
+ /*
+ * The lock is held in exclusive mode.
+ * If the lock is recursed also, then unrecurse it.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
+ LOCK_LOG2(lk, "%s: %p unrecursing", __func__,
+ lk);
+ lk->lk_recurse--;
+ break;
+ }
+ if (tid != LK_KERNPROC)
+ lock_profile_release_lock(&lk->lock_object);
+
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid,
+ LK_UNLOCKED))
+ break;
+
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+ v = LK_UNLOCKED;
+
+ /*
+ * If the lock has exclusive waiters, give them
+ * preference in order to avoid a deadlock with
+ * shared runners-up.
+ * If interruptible sleeps left the exclusive queue
+ * empty, avoid starving the threads sleeping
+ * on the shared queue by giving them precedence
+ * and cleaning up the exclusive waiters bit anyway.
+ * Please note that the lk_exslpfail count may be lying
+ * about the real number of waiters with the
+ * LK_SLEEPFAIL flag on, because they may be used in
+ * conjunction with interruptible sleeps, so
+ * lk_exslpfail should be considered an upper-limit
+ * bound, including the edge cases.
+ */
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ realexslp = sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL
+ * on and using interruptible sleeps/timeout
+ * may have left spurious lk_exslpfail counts
+ * on, so clean them up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ LOCK_LOG3(lk,
+ "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&lk->lk_lock, v);
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, queue);
+ sleepq_release(&lk->lock_object);
+ break;
+ } else
+ wakeup_swapper = wakeupshlk(lk, file, line);
+ break;
+ case LK_DRAIN:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * Trying to drain a lock we already own will result in a
+ * deadlock.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: draining %s with the lock held @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+
+ while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) {
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is expected to not sleep just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock has been released while we spun on
+ * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v = (x & ~LK_EXCLUSIVE_SPINNERS);
+
+ /*
+ * If interruptible sleeps left the exclusive
+ * queue empty, avoid starving the
+ * threads sleeping on the shared queue by
+ * giving them precedence and cleaning up the
+ * exclusive waiters bit anyway.
+ * Please note that the lk_exslpfail count may be
+ * lying about the real number of waiters with
+ * the LK_SLEEPFAIL flag on, because they may
+ * be used in conjunction with interruptible
+ * sleeps, so lk_exslpfail should be considered
+ * an upper-limit bound, including the edge
+ * cases.
+ */
+ if (v & LK_EXCLUSIVE_WAITERS) {
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v &= ~LK_EXCLUSIVE_WAITERS;
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with
+ * LK_SLEEPFAIL on and using
+ * interruptible sleeps/timeout may
+ * have left spurious lk_exslpfail
+ * counts on, so clean them up anyway.
+ */
+ MPASS(v & LK_SHARED_WAITERS);
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ }
+ if (queue == SQ_EXCLUSIVE_QUEUE) {
+ realexslp =
+ sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if (lk->lk_exslpfail >= realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ if (realexslp != 0) {
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(
+ &lk->lock_object,
+ SLEEPQ_LK, 0,
+ SQ_EXCLUSIVE_QUEUE);
+ }
+ } else
+ lk->lk_exslpfail = 0;
+ }
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG3(lk,
+ "%s: %p waking up all threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ?
+ "shared" : "exclusive");
+ wakeup_swapper |= sleepq_broadcast(
+ &lk->lock_object, SLEEPQ_LK, 0, queue);
+
+ /*
+ * If shared waiters have been woken up, we need
+ * to wait for one of them to acquire the lock
+ * before setting the exclusive waiters flag, in
+ * order to avoid a deadlock.
+ */
+ if (queue == SQ_SHARED_QUEUE) {
+ for (v = lk->lk_lock;
+ (v & LK_SHARE) && !LK_SHARERS(v);
+ v = lk->lk_lock)
+ cpu_spinwait();
+ }
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set drain waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK,
+ SQ_EXCLUSIVE_QUEUE);
+ sleepq_wait(&lk->lock_object, ipri & PRIMASK);
+ GIANT_RESTORE();
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ default:
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: unknown lockmgr request 0x%x\n", __func__, op);
+ }
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ if (wakeup_swapper)
+ kick_proc0();
+
+ return (error);
+}
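+
+/*
+ * Note on usage: the request paths handled above are normally reached
+ * through the lockmgr() wrapper macro rather than by calling
+ * __lockmgr_args() directly.  A minimal sketch, assuming the usual
+ * lockinit()/lockmgr()/lockdestroy() interface (the wait message,
+ * priority and flags below are illustrative only):
+ *
+ *	struct lock lk;
+ *
+ *	lockinit(&lk, PVFS, "examplk", 0, 0);
+ *	if (lockmgr(&lk, LK_EXCLUSIVE, NULL) == 0) {
+ *		... exclusive section ...
+ *		lockmgr(&lk, LK_RELEASE, NULL);
+ *	}
+ *	lockdestroy(&lk);
+ */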
+
+void
+_lockmgr_disown(struct lock *lk, const char *file, int line)
+{
+ uintptr_t tid, x;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ tid = (uintptr_t)curthread;
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk))
+ panic("%s: disown a recursed lockmgr @ %s:%d\n",
+ __func__, file, line);
+
+ /*
+ * If the owner is already LK_KERNPROC, just skip the whole operation.
+ */
+ if (LK_HOLDER(lk->lk_lock) != tid)
+ return;
+ lock_profile_release_lock(&lk->lock_object);
+ LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line);
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
+ TD_LOCKS_DEC(curthread);
+ STACK_SAVE(lk);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_KERNPROC | x))
+ return;
+ cpu_spinwait();
+ }
+}
+
+void
+lockmgr_printinfo(const struct lock *lk)
+{
+ struct thread *td;
+ uintptr_t x;
+
+ if (lk->lk_lock == LK_UNLOCKED)
+ printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name);
+ else if (lk->lk_lock & LK_SHARE)
+ printf("lock type %s: SHARED (count %ju)\n",
+ lk->lock_object.lo_name,
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ printf("lock type %s: EXCL by thread %p "
+ "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td,
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid);
+ }
+
+ x = lk->lk_lock;
+ if (x & LK_EXCLUSIVE_WAITERS)
+ printf(" with exclusive waiters pending\n");
+ if (x & LK_SHARED_WAITERS)
+ printf(" with shared waiters pending\n");
+ if (x & LK_EXCLUSIVE_SPINNERS)
+ printf(" with exclusive spinners pending\n");
+
+ STACK_PRINT(lk);
+}
+
+int
+lockstatus(const struct lock *lk)
+{
+ uintptr_t v, x;
+ int ret;
+
+ ret = LK_SHARED;
+ x = lk->lk_lock;
+ v = LK_HOLDER(x);
+
+ if ((x & LK_SHARE) == 0) {
+ if (v == (uintptr_t)curthread || v == LK_KERNPROC)
+ ret = LK_EXCLUSIVE;
+ else
+ ret = LK_EXCLOTHER;
+ } else if (x == LK_UNLOCKED)
+ ret = 0;
+
+ return (ret);
+}
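+
+/*
+ * Usage sketch (hypothetical caller): given the return values computed
+ * above, LK_EXCLUSIVE means curthread (or LK_KERNPROC) holds the lock
+ * exclusively, LK_EXCLOTHER means another thread does, LK_SHARED means
+ * the lock is share-locked and 0 means it is not held at all:
+ *
+ *	if (lockstatus(&lk) != LK_EXCLUSIVE)
+ *		panic("example: lock not exclusively held");
+ */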
+
+#ifdef INVARIANT_SUPPORT
+
+FEATURE(invariant_support,
+ "Support for modules compiled with INVARIANTS option");
+
+#ifndef INVARIANTS
+#undef _lockmgr_assert
+#endif
+
+void
+_lockmgr_assert(const struct lock *lk, int what, const char *file, int line)
+{
+ int slocked = 0;
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case KA_SLOCKED:
+ case KA_SLOCKED | KA_NOTRECURSED:
+ case KA_SLOCKED | KA_RECURSED:
+ slocked = 1;
+ case KA_LOCKED:
+ case KA_LOCKED | KA_NOTRECURSED:
+ case KA_LOCKED | KA_RECURSED:
+#ifdef WITNESS
+
+ /*
+ * We cannot trust WITNESS if the lock is held in exclusive
+ * mode and a call to lockmgr_disown() happened.
+ * Work around this by skipping the check if the lock is held
+ * in exclusive mode, even for the KA_LOCKED case.
+ */
+ if (slocked || (lk->lk_lock & LK_SHARE)) {
+ witness_assert(&lk->lock_object, what, file, line);
+ break;
+ }
+#endif
+ if (lk->lk_lock == LK_UNLOCKED ||
+ ((lk->lk_lock & LK_SHARE) == 0 && (slocked ||
+ (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)))))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ lk->lock_object.lo_name, slocked ? "share" : "",
+ file, line);
+
+ if ((lk->lk_lock & LK_SHARE) == 0) {
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file,
+ line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ }
+ break;
+ case KA_XLOCKED:
+ case KA_XLOCKED | KA_NOTRECURSED:
+ case KA_XLOCKED | KA_RECURSED:
+ if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ case KA_UNLOCKED:
+ if (lockmgr_xlocked(lk) || lockmgr_disowned(lk))
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file,
+ line);
+ }
+}
+#endif
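+
+/*
+ * Callers normally reach the assertion handler above through the
+ * lockmgr_assert() wrapper, e.g. (sketch only):
+ *
+ *	lockmgr_assert(&lk, KA_XLOCKED);
+ *
+ * which panics unless the lock is exclusively held by curthread or has
+ * been disowned to LK_KERNPROC.
+ */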
+
+#ifdef DDB
+int
+lockmgr_chain(struct thread *td, struct thread **ownerp)
+{
+ struct lock *lk;
+
+ lk = td->td_wchan;
+
+ if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
+ return (0);
+ db_printf("blocked on lockmgr %s", lk->lock_object.lo_name);
+ if (lk->lk_lock & LK_SHARE)
+ db_printf("SHARED (count %ju)\n",
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else
+ db_printf("EXCL\n");
+ *ownerp = lockmgr_xholder(lk);
+
+ return (1);
+}
+
+static void
+db_show_lockmgr(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct lock *lk;
+
+ lk = (const struct lock *)lock;
+
+ db_printf(" state: ");
+ if (lk->lk_lock == LK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (lk->lk_lock & LK_SHARE)
+ db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ if (td == (struct thread *)LK_KERNPROC)
+ db_printf("XLOCK: LK_KERNPROC\n");
+ else
+ db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ if (lockmgr_recursed(lk))
+ db_printf(" recursed: %d\n", lk->lk_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (lk->lk_lock & LK_ALL_WAITERS) {
+ case LK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case LK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case LK_ALL_WAITERS:
+ db_printf("shared and exclusive\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+ db_printf(" spinners: ");
+ if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS)
+ db_printf("exclusive\n");
+ else
+ db_printf("none\n");
+}
+#endif
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
new file mode 100644
index 0000000..6d6dc51
--- /dev/null
+++ b/sys/kern/kern_lockf.c
@@ -0,0 +1,2545 @@
+/*-
+ * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
+ * Authors: Doug Rabson <dfr@rabson.org>
+ * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Scooter Morris at Genentech Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_debug_lockf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/hash.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/lockf.h>
+#include <sys/taskqueue.h>
+
+#ifdef LOCKF_DEBUG
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+
+static int lockf_debug = 0; /* control debug output */
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+#endif
+
+static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
+
+struct owner_edge;
+struct owner_vertex;
+struct owner_vertex_list;
+struct owner_graph;
+
+#define NOLOCKF (struct lockf_entry *)0
+#define SELF 0x1
+#define OTHERS 0x2
+static void lf_init(void *);
+static int lf_hash_owner(caddr_t, struct flock *, int);
+static int lf_owner_matches(struct lock_owner *, caddr_t, struct flock *,
+ int);
+static struct lockf_entry *
+ lf_alloc_lock(struct lock_owner *);
+static int lf_free_lock(struct lockf_entry *);
+static int lf_clearlock(struct lockf *, struct lockf_entry *);
+static int lf_overlaps(struct lockf_entry *, struct lockf_entry *);
+static int lf_blocks(struct lockf_entry *, struct lockf_entry *);
+static void lf_free_edge(struct lockf_edge *);
+static struct lockf_edge *
+ lf_alloc_edge(void);
+static void lf_alloc_vertex(struct lockf_entry *);
+static int lf_add_edge(struct lockf_entry *, struct lockf_entry *);
+static void lf_remove_edge(struct lockf_edge *);
+static void lf_remove_outgoing(struct lockf_entry *);
+static void lf_remove_incoming(struct lockf_entry *);
+static int lf_add_outgoing(struct lockf *, struct lockf_entry *);
+static int lf_add_incoming(struct lockf *, struct lockf_entry *);
+static int lf_findoverlap(struct lockf_entry **, struct lockf_entry *,
+ int);
+static struct lockf_entry *
+ lf_getblock(struct lockf *, struct lockf_entry *);
+static int lf_getlock(struct lockf *, struct lockf_entry *, struct flock *);
+static void lf_insert_lock(struct lockf *, struct lockf_entry *);
+static void lf_wakeup_lock(struct lockf *, struct lockf_entry *);
+static void lf_update_dependancies(struct lockf *, struct lockf_entry *,
+ int all, struct lockf_entry_list *);
+static void lf_set_start(struct lockf *, struct lockf_entry *, off_t,
+ struct lockf_entry_list*);
+static void lf_set_end(struct lockf *, struct lockf_entry *, off_t,
+ struct lockf_entry_list*);
+static int lf_setlock(struct lockf *, struct lockf_entry *,
+ struct vnode *, void **cookiep);
+static int lf_cancel(struct lockf *, struct lockf_entry *, void *);
+static void lf_split(struct lockf *, struct lockf_entry *,
+ struct lockf_entry *, struct lockf_entry_list *);
+#ifdef LOCKF_DEBUG
+static int graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *path);
+static void graph_check(struct owner_graph *g, int checkorder);
+static void graph_print_vertices(struct owner_vertex_list *set);
+#endif
+static int graph_delta_forward(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *delta);
+static int graph_delta_backward(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *delta);
+static int graph_add_indices(int *indices, int n,
+ struct owner_vertex_list *set);
+static int graph_assign_indices(struct owner_graph *g, int *indices,
+ int nextunused, struct owner_vertex_list *set);
+static int graph_add_edge(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y);
+static void graph_remove_edge(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y);
+static struct owner_vertex *graph_alloc_vertex(struct owner_graph *g,
+ struct lock_owner *lo);
+static void graph_free_vertex(struct owner_graph *g,
+ struct owner_vertex *v);
+static struct owner_graph * graph_init(struct owner_graph *g);
+#ifdef LOCKF_DEBUG
+static void lf_print(char *, struct lockf_entry *);
+static void lf_printlist(char *, struct lockf_entry *);
+static void lf_print_owner(struct lock_owner *);
+#endif
+
+/*
+ * This structure is used to keep track of both local and remote lock
+ * owners. The lf_owner field of the struct lockf_entry points back at
+ * the lock owner structure. Each possible lock owner (local proc for
+ * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid>
+ * pair for remote locks) is represented by a unique instance of
+ * struct lock_owner.
+ *
+ * If a lock owner has a lock that blocks some other lock or a lock
+ * that is waiting for some other lock, it also has a vertex in the
+ * owner_graph below.
+ *
+ * Locks:
+ * (s) locked by state->ls_lock
+ * (S) locked by lf_lock_states_lock
+ * (l) locked by lf_lock_owners_lock
+ * (g) locked by lf_owner_graph_lock
+ * (c) const until freeing
+ */
+#define LOCK_OWNER_HASH_SIZE 256
+
+struct lock_owner {
+ LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */
+ int lo_refs; /* (l) Number of locks referring to this */
+ int lo_flags; /* (c) Flags passed to lf_advlock */
+ caddr_t lo_id; /* (c) Id value passed to lf_advlock */
+ pid_t lo_pid; /* (c) Process Id of the lock owner */
+ int lo_sysid; /* (c) System Id of the lock owner */
+ struct owner_vertex *lo_vertex; /* (g) entry in deadlock graph */
+};
+
+LIST_HEAD(lock_owner_list, lock_owner);
+
+static struct sx lf_lock_states_lock;
+static struct lockf_list lf_lock_states; /* (S) */
+static struct sx lf_lock_owners_lock;
+static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */
+
+/*
+ * Structures for deadlock detection.
+ *
+ * We have two types of directed graph.  The first is the set of locks,
+ * both active and pending, on a vnode.  Within this graph, active locks
+ * are terminal nodes (i.e. they have no out-going
+ * edges).  Pending locks have out-going edges to each blocking active
+ * lock that prevents the lock from being granted and also to each
+ * older pending lock that would block them if it was active.  The
+ * graph for each vnode is naturally acyclic; new edges are only ever
+ * added to or from new nodes (either new pending locks, which only add
+ * out-going edges, or new active locks, which only add in-coming edges),
+ * so they cannot create loops in the lock graph.
+ *
+ * The second graph is a global graph of lock owners. Each lock owner
+ * is a vertex in that graph and an edge is added to the graph
+ * whenever an edge is added to a vnode graph, with end points
+ * corresponding to owner of the new pending lock and the owner of the
+ * lock upon which it waits. In order to prevent deadlock, we only add
+ * an edge to this graph if the new edge would not create a cycle.
+ *
+ * The lock owner graph is topologically sorted, i.e. if a node has
+ * any outgoing edges, then it has an order strictly less than any
+ * node to which it has an outgoing edge. We preserve this ordering
+ * (and detect cycles) on edge insertion using Algorithm PK from the
+ * paper "A Dynamic Topological Sort Algorithm for Directed Acyclic
+ * Graphs" (ACM Journal of Experimental Algorithms, Vol 11, Article
+ * No. 1.7)
+ */
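+
+/*
+ * Conceptual sketch (simplified; not the Algorithm PK code used by
+ * graph_add_edge() below): adding an owner edge x -> y is refused
+ * whenever x is already reachable from y, because the new edge would
+ * then close a cycle, i.e. a deadlock among lock owners:
+ *
+ *	if (reachable(y, x))		(hypothetical helper)
+ *		return (EDEADLK);
+ *	insert the edge and restore the topological order
+ *
+ * Algorithm PK reaches the same answer without a full graph search by
+ * only visiting vertices whose order lies between y's and x's.
+ */
+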
+struct owner_vertex;
+
+struct owner_edge {
+ LIST_ENTRY(owner_edge) e_outlink; /* (g) link from's out-edge list */
+ LIST_ENTRY(owner_edge) e_inlink; /* (g) link to's in-edge list */
+ int e_refs; /* (g) number of times added */
+ struct owner_vertex *e_from; /* (c) out-going from here */
+ struct owner_vertex *e_to; /* (c) in-coming to here */
+};
+LIST_HEAD(owner_edge_list, owner_edge);
+
+struct owner_vertex {
+ TAILQ_ENTRY(owner_vertex) v_link; /* (g) workspace for edge insertion */
+ uint32_t v_gen; /* (g) workspace for edge insertion */
+ int v_order; /* (g) order of vertex in graph */
+ struct owner_edge_list v_outedges;/* (g) list of out-edges */
+ struct owner_edge_list v_inedges; /* (g) list of in-edges */
+ struct lock_owner *v_owner; /* (c) corresponding lock owner */
+};
+TAILQ_HEAD(owner_vertex_list, owner_vertex);
+
+struct owner_graph {
+ struct owner_vertex** g_vertices; /* (g) pointers to vertices */
+ int g_size; /* (g) number of vertices */
+ int g_space; /* (g) space allocated for vertices */
+ int *g_indexbuf; /* (g) workspace for loop detection */
+ uint32_t g_gen; /* (g) increment when re-ordering */
+};
+
+static struct sx lf_owner_graph_lock;
+static struct owner_graph lf_owner_graph;
+
+/*
+ * Initialise various structures and locks.
+ */
+static void
+lf_init(void *dummy)
+{
+ int i;
+
+ sx_init(&lf_lock_states_lock, "lock states lock");
+ LIST_INIT(&lf_lock_states);
+
+ sx_init(&lf_lock_owners_lock, "lock owners lock");
+ for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
+ LIST_INIT(&lf_lock_owners[i]);
+
+ sx_init(&lf_owner_graph_lock, "owner graph lock");
+ graph_init(&lf_owner_graph);
+}
+SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL);
+
+/*
+ * Generate a hash value for a lock owner.
+ */
+static int
+lf_hash_owner(caddr_t id, struct flock *fl, int flags)
+{
+ uint32_t h;
+
+ if (flags & F_REMOTE) {
+ h = HASHSTEP(0, fl->l_pid);
+ h = HASHSTEP(h, fl->l_sysid);
+ } else if (flags & F_FLOCK) {
+ h = ((uintptr_t) id) >> 7;
+ } else {
+ struct proc *p = (struct proc *) id;
+ h = HASHSTEP(0, p->p_pid);
+ h = HASHSTEP(h, 0);
+ }
+
+ return (h % LOCK_OWNER_HASH_SIZE);
+}
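+
+/*
+ * In other words: remote locks hash the <pid, sysid> pair, BSD flock
+ * locks hash the id pointer itself (the local file, per the lock_owner
+ * description above) with the low bits shifted away, and POSIX locks
+ * hash the owning process's pid.  The result is reduced modulo
+ * LOCK_OWNER_HASH_SIZE to select a bucket in lf_lock_owners[].
+ */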
+
+/*
+ * Return true if a lock owner matches the details passed to
+ * lf_advlock.
+ */
+static int
+lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl,
+ int flags)
+{
+ if (flags & F_REMOTE) {
+ return lo->lo_pid == fl->l_pid
+ && lo->lo_sysid == fl->l_sysid;
+ } else {
+ return lo->lo_id == id;
+ }
+}
+
+static struct lockf_entry *
+lf_alloc_lock(struct lock_owner *lo)
+{
+ struct lockf_entry *lf;
+
+ lf = malloc(sizeof(struct lockf_entry), M_LOCKF, M_WAITOK|M_ZERO);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Allocated lock %p\n", lf);
+#endif
+ if (lo) {
+ sx_xlock(&lf_lock_owners_lock);
+ lo->lo_refs++;
+ sx_xunlock(&lf_lock_owners_lock);
+ lf->lf_owner = lo;
+ }
+
+ return (lf);
+}
+
+static int
+lf_free_lock(struct lockf_entry *lock)
+{
+
+ KASSERT(lock->lf_refs > 0, ("lockf_entry negative ref count %p", lock));
+ if (--lock->lf_refs > 0)
+ return (0);
+ /*
+ * Adjust the lock_owner reference count and
+ * reclaim the entry if this is the last lock
+ * for that owner.
+ */
+ struct lock_owner *lo = lock->lf_owner;
+ if (lo) {
+ KASSERT(LIST_EMPTY(&lock->lf_outedges),
+ ("freeing lock with dependancies"));
+ KASSERT(LIST_EMPTY(&lock->lf_inedges),
+ ("freeing lock with dependants"));
+ sx_xlock(&lf_lock_owners_lock);
+ KASSERT(lo->lo_refs > 0, ("lock owner refcount"));
+ lo->lo_refs--;
+ if (lo->lo_refs == 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ printf("lf_free_lock: freeing lock owner %p\n",
+ lo);
+#endif
+ if (lo->lo_vertex) {
+ sx_xlock(&lf_owner_graph_lock);
+ graph_free_vertex(&lf_owner_graph,
+ lo->lo_vertex);
+ sx_xunlock(&lf_owner_graph_lock);
+ }
+ LIST_REMOVE(lo, lo_link);
+ free(lo, M_LOCKF);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Freed lock owner %p\n", lo);
+#endif
+ }
+ sx_unlock(&lf_lock_owners_lock);
+ }
+ if ((lock->lf_flags & F_REMOTE) && lock->lf_vnode) {
+ vrele(lock->lf_vnode);
+ lock->lf_vnode = NULL;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Freed lock %p\n", lock);
+#endif
+ free(lock, M_LOCKF);
+ return (1);
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **statep,
+ u_quad_t size)
+{
+ struct lockf *state, *freestate = NULL;
+ struct flock *fl = ap->a_fl;
+ struct lockf_entry *lock;
+ struct vnode *vp = ap->a_vp;
+ caddr_t id = ap->a_id;
+ int flags = ap->a_flags;
+ int hash;
+ struct lock_owner *lo;
+ off_t start, end, oadd;
+ int error;
+
+ /*
+ * Handle the F_UNLCKSYS case first - no need to mess about
+ * creating a lock owner for this one.
+ */
+ if (ap->a_op == F_UNLCKSYS) {
+ lf_clearremotesys(fl->l_sysid);
+ return (0);
+ }
+
+ /*
+ * Convert the flock structure into a start and end.
+ */
+ switch (fl->l_whence) {
+
+ case SEEK_SET:
+ case SEEK_CUR:
+ /*
+ * Caller is responsible for adding any necessary offset
+ * when SEEK_CUR is used.
+ */
+ start = fl->l_start;
+ break;
+
+ case SEEK_END:
+ if (size > OFF_MAX ||
+ (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
+ return (EOVERFLOW);
+ start = size + fl->l_start;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (start < 0)
+ return (EINVAL);
+ if (fl->l_len < 0) {
+ if (start == 0)
+ return (EINVAL);
+ end = start - 1;
+ start += fl->l_len;
+ if (start < 0)
+ return (EINVAL);
+ } else if (fl->l_len == 0) {
+ end = OFF_MAX;
+ } else {
+ oadd = fl->l_len - 1;
+ if (oadd > OFF_MAX - start)
+ return (EOVERFLOW);
+ end = start + oadd;
+ }
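+
+ /*
+ * Worked examples of the conversion above (illustrative values,
+ * assuming SEEK_SET and l_start = 100): l_len = 50 yields the
+ * range [100, 149], l_len = 0 locks to end of file as
+ * [100, OFF_MAX], and l_len = -10 yields [90, 99].
+ */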
+ /*
+ * Avoid the common case of unlocking when the inode has no locks.
+ */
+ VI_LOCK(vp);
+ if ((*statep) == NULL) {
+ if (ap->a_op != F_SETLK) {
+ fl->l_type = F_UNLCK;
+ VI_UNLOCK(vp);
+ return (0);
+ }
+ }
+ VI_UNLOCK(vp);
+
+ /*
+ * Map our arguments to an existing lock owner or create one
+ * if this is the first time we have seen this owner.
+ */
+ hash = lf_hash_owner(id, fl, flags);
+ sx_xlock(&lf_lock_owners_lock);
+ LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link)
+ if (lf_owner_matches(lo, id, fl, flags))
+ break;
+ if (!lo) {
+ /*
+ * We initialise the lock owner with a reference
+ * count which matches the new lockf_entry
+ * structure created below.
+ */
+ lo = malloc(sizeof(struct lock_owner), M_LOCKF,
+ M_WAITOK|M_ZERO);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Allocated lock owner %p\n", lo);
+#endif
+
+ lo->lo_refs = 1;
+ lo->lo_flags = flags;
+ lo->lo_id = id;
+ if (flags & F_REMOTE) {
+ lo->lo_pid = fl->l_pid;
+ lo->lo_sysid = fl->l_sysid;
+ } else if (flags & F_FLOCK) {
+ lo->lo_pid = -1;
+ lo->lo_sysid = 0;
+ } else {
+ struct proc *p = (struct proc *) id;
+ lo->lo_pid = p->p_pid;
+ lo->lo_sysid = 0;
+ }
+ lo->lo_vertex = NULL;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ printf("lf_advlockasync: new lock owner %p ", lo);
+ lf_print_owner(lo);
+ printf("\n");
+ }
+#endif
+
+ LIST_INSERT_HEAD(&lf_lock_owners[hash], lo, lo_link);
+ } else {
+ /*
+ * We have seen this lock owner before; increase its
+ * reference count to account for the new lockf_entry
+ * structure we create below.
+ */
+ lo->lo_refs++;
+ }
+ sx_xunlock(&lf_lock_owners_lock);
+
+ /*
+ * Create the lockf structure. We initialise the lf_owner
+ * field here instead of in lf_alloc_lock() to avoid paying
+ * the lf_lock_owners_lock tax twice.
+ */
+ lock = lf_alloc_lock(NULL);
+ lock->lf_refs = 1;
+ lock->lf_start = start;
+ lock->lf_end = end;
+ lock->lf_owner = lo;
+ lock->lf_vnode = vp;
+ if (flags & F_REMOTE) {
+ /*
+ * For remote locks, the caller may release its ref to
+ * the vnode at any time - we have to ref it here to
+ * prevent it from being recycled unexpectedly.
+ */
+ vref(vp);
+ }
+
+ /*
+ * XXX The problem is that VTOI is ufs specific, so it will
+ * break LOCKF_DEBUG for all filesystems other than UFS because
+ * it casts the vnode->data ptr to struct inode *.
+ */
+/* lock->lf_inode = VTOI(ap->a_vp); */
+ lock->lf_inode = (struct inode *)0;
+ lock->lf_type = fl->l_type;
+ LIST_INIT(&lock->lf_outedges);
+ LIST_INIT(&lock->lf_inedges);
+ lock->lf_async_task = ap->a_task;
+ lock->lf_flags = ap->a_flags;
+
+ /*
+ * Do the requested operation. First find our state structure
+ * and create a new one if necessary - the caller's *statep
+ * variable and the state's ls_threads count are protected by
+ * the vnode interlock.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ VI_UNLOCK(vp);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Allocate a state structure if necessary.
+ */
+ state = *statep;
+ if (state == NULL) {
+ struct lockf *ls;
+
+ VI_UNLOCK(vp);
+
+ ls = malloc(sizeof(struct lockf), M_LOCKF, M_WAITOK|M_ZERO);
+ sx_init(&ls->ls_lock, "ls_lock");
+ LIST_INIT(&ls->ls_active);
+ LIST_INIT(&ls->ls_pending);
+ ls->ls_threads = 1;
+
+ sx_xlock(&lf_lock_states_lock);
+ LIST_INSERT_HEAD(&lf_lock_states, ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+
+ /*
+ * Cope if we lost a race with some other thread while
+ * trying to allocate memory.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ VI_UNLOCK(vp);
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&ls->ls_lock);
+ free(ls, M_LOCKF);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+ if ((*statep) == NULL) {
+ state = *statep = ls;
+ VI_UNLOCK(vp);
+ } else {
+ state = *statep;
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&ls->ls_lock);
+ free(ls, M_LOCKF);
+ }
+ } else {
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+ }
+
+ sx_xlock(&state->ls_lock);
+ /*
+ * Recheck the doomed vnode after state->ls_lock is
+ * locked. lf_purgelocks() requires that no new threads add
+ * pending locks when vnode is marked by VI_DOOMED flag.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ state->ls_threads--;
+ wakeup(state);
+ VI_UNLOCK(vp);
+ sx_xunlock(&state->ls_lock);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+ VI_UNLOCK(vp);
+
+ switch (ap->a_op) {
+ case F_SETLK:
+ error = lf_setlock(state, lock, vp, ap->a_cookiep);
+ break;
+
+ case F_UNLCK:
+ error = lf_clearlock(state, lock);
+ lf_free_lock(lock);
+ break;
+
+ case F_GETLK:
+ error = lf_getlock(state, lock, fl);
+ lf_free_lock(lock);
+ break;
+
+ case F_CANCEL:
+ if (ap->a_cookiep)
+ error = lf_cancel(state, lock, *ap->a_cookiep);
+ else
+ error = EINVAL;
+ lf_free_lock(lock);
+ break;
+
+ default:
+ lf_free_lock(lock);
+ error = EINVAL;
+ break;
+ }
+
+#ifdef INVARIANTS
+ /*
+ * Check for conditions that should never happen: the active
+ * lock list becoming disordered or containing mutually
+ * blocking locks. We also check the pending list for locks
+ * which should be active (i.e. have no out-going edges).
+ */
+ LIST_FOREACH(lock, &state->ls_active, lf_link) {
+ struct lockf_entry *lf;
+ if (LIST_NEXT(lock, lf_link))
+ KASSERT((lock->lf_start
+ <= LIST_NEXT(lock, lf_link)->lf_start),
+ ("locks disordered"));
+ LIST_FOREACH(lf, &state->ls_active, lf_link) {
+ if (lock == lf)
+ break;
+ KASSERT(!lf_blocks(lock, lf),
+ ("two conflicting active locks"));
+ if (lock->lf_owner == lf->lf_owner)
+ KASSERT(!lf_overlaps(lock, lf),
+ ("two overlapping locks from same owner"));
+ }
+ }
+ LIST_FOREACH(lock, &state->ls_pending, lf_link) {
+ KASSERT(!LIST_EMPTY(&lock->lf_outedges),
+ ("pending lock which should be active"));
+ }
+#endif
+ sx_xunlock(&state->ls_lock);
+
+ /*
+ * If we have removed the last active lock on the vnode and
+ * this is the last thread that was in-progress, we can free
+ * the state structure. We update the caller's pointer inside
+ * the vnode interlock but call free outside.
+ *
+ * XXX alternatively, keep the state structure around until
+ * the filesystem recycles - requires a callback from the
+ * filesystem.
+ */
+ VI_LOCK(vp);
+
+ state->ls_threads--;
+ wakeup(state);
+ if (LIST_EMPTY(&state->ls_active) && state->ls_threads == 0) {
+ KASSERT(LIST_EMPTY(&state->ls_pending),
+ ("freeing state with pending locks"));
+ freestate = state;
+ *statep = NULL;
+ }
+
+ VI_UNLOCK(vp);
+
+ if (freestate) {
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(freestate, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&freestate->ls_lock);
+ free(freestate, M_LOCKF);
+ }
+ return (error);
+}
+
+int
+lf_advlock(struct vop_advlock_args *ap, struct lockf **statep, u_quad_t size)
+{
+ struct vop_advlockasync_args a;
+
+ a.a_vp = ap->a_vp;
+ a.a_id = ap->a_id;
+ a.a_op = ap->a_op;
+ a.a_fl = ap->a_fl;
+ a.a_flags = ap->a_flags;
+ a.a_task = NULL;
+ a.a_cookiep = NULL;
+
+ return (lf_advlockasync(&a, statep, size));
+}
+
+void
+lf_purgelocks(struct vnode *vp, struct lockf **statep)
+{
+ struct lockf *state;
+ struct lockf_entry *lock, *nlock;
+
+ /*
+ * For this to work correctly, the caller must ensure that no
+ * other threads enter the locking system for this vnode,
+ * e.g. by checking VI_DOOMED. We wake up any threads that are
+ * sleeping waiting for locks on this vnode and then free all
+ * the remaining locks.
+ */
+ VI_LOCK(vp);
+ KASSERT(vp->v_iflag & VI_DOOMED,
+ ("lf_purgelocks: vp %p has not vgone yet", vp));
+ state = *statep;
+ if (state) {
+ *statep = NULL;
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&state->ls_lock);
+ sx_xlock(&lf_owner_graph_lock);
+ LIST_FOREACH_SAFE(lock, &state->ls_pending, lf_link, nlock) {
+ LIST_REMOVE(lock, lf_link);
+ lf_remove_outgoing(lock);
+ lf_remove_incoming(lock);
+
+ /*
+ * If it is an async lock, we can just free it
+ * here, otherwise we let the sleeping thread
+ * free it.
+ */
+ if (lock->lf_async_task) {
+ lf_free_lock(lock);
+ } else {
+ lock->lf_flags |= F_INTR;
+ wakeup(lock);
+ }
+ }
+ sx_xunlock(&lf_owner_graph_lock);
+ sx_xunlock(&state->ls_lock);
+
+ /*
+ * Wait for all other threads, sleeping and otherwise,
+ * to leave.
+ */
+ VI_LOCK(vp);
+ while (state->ls_threads > 1)
+ msleep(state, VI_MTX(vp), 0, "purgelocks", 0);
+ VI_UNLOCK(vp);
+
+ /*
+ * We can just free all the active locks since they
+ * will have no dependencies (we removed them all
+ * above). We don't need to bother locking since we
+ * are the last thread using this state structure.
+ */
+ KASSERT(LIST_EMPTY(&state->ls_pending),
+ ("lock pending for %p", state));
+ LIST_FOREACH_SAFE(lock, &state->ls_active, lf_link, nlock) {
+ LIST_REMOVE(lock, lf_link);
+ lf_free_lock(lock);
+ }
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(state, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&state->ls_lock);
+ free(state, M_LOCKF);
+ } else {
+ VI_UNLOCK(vp);
+ }
+}
+
+/*
+ * Return non-zero if locks 'x' and 'y' overlap.
+ */
+static int
+lf_overlaps(struct lockf_entry *x, struct lockf_entry *y)
+{
+
+ return (x->lf_start <= y->lf_end && x->lf_end >= y->lf_start);
+}
+
+/*
+ * Return non-zero if lock 'x' is blocked by lock 'y' (or vice versa).
+ */
+static int
+lf_blocks(struct lockf_entry *x, struct lockf_entry *y)
+{
+
+ return x->lf_owner != y->lf_owner
+ && (x->lf_type == F_WRLCK || y->lf_type == F_WRLCK)
+ && lf_overlaps(x, y);
+}
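+
+/*
+ * For example (illustrative ranges only): locks covering [0, 9] and
+ * [5, 14] overlap; they block each other only if they belong to
+ * different owners and at least one of them is a write (F_WRLCK) lock.
+ */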
+
+/*
+ * Allocate a lock edge from the free list
+ */
+static struct lockf_edge *
+lf_alloc_edge(void)
+{
+
+ return (malloc(sizeof(struct lockf_edge), M_LOCKF, M_WAITOK|M_ZERO));
+}
+
+/*
+ * Free a lock edge.
+ */
+static void
+lf_free_edge(struct lockf_edge *e)
+{
+
+ free(e, M_LOCKF);
+}
+
+
+/*
+ * Ensure that the lock's owner has a corresponding vertex in the
+ * owner graph.
+ */
+static void
+lf_alloc_vertex(struct lockf_entry *lock)
+{
+ struct owner_graph *g = &lf_owner_graph;
+
+ if (!lock->lf_owner->lo_vertex)
+ lock->lf_owner->lo_vertex =
+ graph_alloc_vertex(g, lock->lf_owner);
+}
+
+/*
+ * Attempt to record an edge from lock x to lock y. Return EDEADLK if
+ * the new edge would cause a cycle in the owner graph.
+ */
+static int
+lf_add_edge(struct lockf_entry *x, struct lockf_entry *y)
+{
+ struct owner_graph *g = &lf_owner_graph;
+ struct lockf_edge *e;
+ int error;
+
+#ifdef INVARIANTS
+ LIST_FOREACH(e, &x->lf_outedges, le_outlink)
+ KASSERT(e->le_to != y, ("adding lock edge twice"));
+#endif
+
+ /*
+ * Make sure the two owners have entries in the owner graph.
+ */
+ lf_alloc_vertex(x);
+ lf_alloc_vertex(y);
+
+ error = graph_add_edge(g, x->lf_owner->lo_vertex,
+ y->lf_owner->lo_vertex);
+ if (error)
+ return (error);
+
+ e = lf_alloc_edge();
+ LIST_INSERT_HEAD(&x->lf_outedges, e, le_outlink);
+ LIST_INSERT_HEAD(&y->lf_inedges, e, le_inlink);
+ e->le_from = x;
+ e->le_to = y;
+
+ return (0);
+}
+
+/*
+ * Remove an edge from the lock graph.
+ */
+static void
+lf_remove_edge(struct lockf_edge *e)
+{
+ struct owner_graph *g = &lf_owner_graph;
+ struct lockf_entry *x = e->le_from;
+ struct lockf_entry *y = e->le_to;
+
+ graph_remove_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex);
+ LIST_REMOVE(e, le_outlink);
+ LIST_REMOVE(e, le_inlink);
+ e->le_from = NULL;
+ e->le_to = NULL;
+ lf_free_edge(e);
+}
+
+/*
+ * Remove all out-going edges from lock x.
+ */
+static void
+lf_remove_outgoing(struct lockf_entry *x)
+{
+ struct lockf_edge *e;
+
+ while ((e = LIST_FIRST(&x->lf_outedges)) != NULL) {
+ lf_remove_edge(e);
+ }
+}
+
+/*
+ * Remove all in-coming edges from lock x.
+ */
+static void
+lf_remove_incoming(struct lockf_entry *x)
+{
+ struct lockf_edge *e;
+
+ while ((e = LIST_FIRST(&x->lf_inedges)) != NULL) {
+ lf_remove_edge(e);
+ }
+}
+
+/*
+ * Walk the list of locks for the file and create an out-going edge
+ * from lock to each blocking lock.
+ */
+static int
+lf_add_outgoing(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+ int error;
+
+ LIST_FOREACH(overlap, &state->ls_active, lf_link) {
+ /*
+ * We may assume that the active list is sorted by
+ * lf_start.
+ */
+ if (overlap->lf_start > lock->lf_end)
+ break;
+ if (!lf_blocks(lock, overlap))
+ continue;
+
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(lock, overlap);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_outgoing(lock);
+ return (error);
+ }
+ }
+
+ /*
+ * We also need to add edges to sleeping locks that block
+ * us. This ensures that lf_wakeup_lock cannot grant two
+ * mutually blocking locks simultaneously and also enforces a
+ * 'first come, first served' fairness model. Note that this
+ * only happens if we are blocked by at least one active lock
+ * due to the call to lf_getblock in lf_setlock below.
+ */
+ LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
+ if (!lf_blocks(lock, overlap))
+ continue;
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(lock, overlap);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_outgoing(lock);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Walk the list of pending locks for the file and create an in-coming
+ * edge from lock to each blocking lock.
+ */
+static int
+lf_add_incoming(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+ int error;
+
+ LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
+ if (!lf_blocks(lock, overlap))
+ continue;
+
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(overlap, lock);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_incoming(lock);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Insert lock into the active list, keeping list entries ordered by
+ * increasing values of lf_start.
+ */
+static void
+lf_insert_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *lf, *lfprev;
+
+ if (LIST_EMPTY(&state->ls_active)) {
+ LIST_INSERT_HEAD(&state->ls_active, lock, lf_link);
+ return;
+ }
+
+ lfprev = NULL;
+ LIST_FOREACH(lf, &state->ls_active, lf_link) {
+ if (lf->lf_start > lock->lf_start) {
+ LIST_INSERT_BEFORE(lf, lock, lf_link);
+ return;
+ }
+ lfprev = lf;
+ }
+ LIST_INSERT_AFTER(lfprev, lock, lf_link);
+}
+
+/*
+ * Wake up a sleeping lock and remove it from the pending list now
+ * that all its dependencies have been resolved. The caller should
+ * arrange for the lock to be added to the active list, adjusting any
+ * existing locks for the same owner as needed.
+ */
+static void
+lf_wakeup_lock(struct lockf *state, struct lockf_entry *wakelock)
+{
+
+ /*
+ * Remove from ls_pending list and wake up the caller
+ * or start the async notification, as appropriate.
+ */
+ LIST_REMOVE(wakelock, lf_link);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_wakeup_lock: awakening", wakelock);
+#endif /* LOCKF_DEBUG */
+ if (wakelock->lf_async_task) {
+ taskqueue_enqueue(taskqueue_thread, wakelock->lf_async_task);
+ } else {
+ wakeup(wakelock);
+ }
+}
+
+/*
+ * Re-check all dependent locks and remove edges to locks that we no
+ * longer block. If 'all' is non-zero, the lock has been removed and
+ * we must remove all the dependencies; otherwise it has simply been
+ * reduced but remains active. Any pending locks which have been
+ * unblocked are added to 'granted'.
+ */
+static void
+lf_update_dependancies(struct lockf *state, struct lockf_entry *lock, int all,
+ struct lockf_entry_list *granted)
+{
+ struct lockf_edge *e, *ne;
+ struct lockf_entry *deplock;
+
+ LIST_FOREACH_SAFE(e, &lock->lf_inedges, le_inlink, ne) {
+ deplock = e->le_from;
+ if (all || !lf_blocks(lock, deplock)) {
+ sx_xlock(&lf_owner_graph_lock);
+ lf_remove_edge(e);
+ sx_xunlock(&lf_owner_graph_lock);
+ if (LIST_EMPTY(&deplock->lf_outedges)) {
+ lf_wakeup_lock(state, deplock);
+ LIST_INSERT_HEAD(granted, deplock, lf_link);
+ }
+ }
+ }
+}
+
+/*
+ * Set the start of an existing active lock, updating dependencies and
+ * adding any newly woken locks to 'granted'.
+ */
+static void
+lf_set_start(struct lockf *state, struct lockf_entry *lock, off_t new_start,
+ struct lockf_entry_list *granted)
+{
+
+ KASSERT(new_start >= lock->lf_start, ("can't increase lock"));
+ lock->lf_start = new_start;
+ LIST_REMOVE(lock, lf_link);
+ lf_insert_lock(state, lock);
+ lf_update_dependancies(state, lock, FALSE, granted);
+}
+
+/*
+ * Set the end of an existing active lock, updating dependencies and
+ * adding any newly woken locks to 'granted'.
+ */
+static void
+lf_set_end(struct lockf *state, struct lockf_entry *lock, off_t new_end,
+ struct lockf_entry_list *granted)
+{
+
+ KASSERT(new_end <= lock->lf_end, ("can't increase lock"));
+ lock->lf_end = new_end;
+ lf_update_dependancies(state, lock, FALSE, granted);
+}
+
+/*
+ * Add a lock to the active list, updating or removing any current
+ * locks owned by the same owner and processing any pending locks that
+ * become unblocked as a result. This code is also used for unlock
+ * since the logic for updating existing locks is identical.
+ *
+ * Processing the new lock may unblock existing pending locks as a
+ * result of downgrading or unlocking.  We simply activate the newly
+ * granted locks by looping.
+ *
+ * Since the new lock already has its dependencies set up, we always
+ * add it to the list (unless it is an unlock request).  This may
+ * fragment the lock list in some pathological cases but it is probably
+ * not a real problem.
+ */
+static void
+lf_activate_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap, *lf;
+ struct lockf_entry_list granted;
+ int ovcase;
+
+ LIST_INIT(&granted);
+ LIST_INSERT_HEAD(&granted, lock, lf_link);
+
+ while (!LIST_EMPTY(&granted)) {
+ lock = LIST_FIRST(&granted);
+ LIST_REMOVE(lock, lf_link);
+
+ /*
+ * Skip over locks owned by other processes. Handle
+ * any locks that overlap and are owned by ourselves.
+ */
+ overlap = LIST_FIRST(&state->ls_active);
+ for (;;) {
+ ovcase = lf_findoverlap(&overlap, lock, SELF);
+
+#ifdef LOCKF_DEBUG
+ if (ovcase && (lockf_debug & 2)) {
+ printf("lf_setlock: overlap %d", ovcase);
+ lf_print("", overlap);
+ }
+#endif
+ /*
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ switch (ovcase) {
+ case 0: /* no overlap */
+ break;
+
+ case 1: /* overlap == lock */
+ /*
+ * We have already set up the
+ * dependents for the new lock, taking
+ * into account a possible downgrade
+ * or unlock. Remove the old lock.
+ */
+ LIST_REMOVE(overlap, lf_link);
+ lf_update_dependancies(state, overlap, TRUE,
+ &granted);
+ lf_free_lock(overlap);
+ break;
+
+ case 2: /* overlap contains lock */
+ /*
+ * Just split the existing lock.
+ */
+ lf_split(state, overlap, lock, &granted);
+ break;
+
+ case 3: /* lock contains overlap */
+ /*
+ * Delete the overlap and advance to
+ * the next entry in the list.
+ */
+ lf = LIST_NEXT(overlap, lf_link);
+ LIST_REMOVE(overlap, lf_link);
+ lf_update_dependancies(state, overlap, TRUE,
+ &granted);
+ lf_free_lock(overlap);
+ overlap = lf;
+ continue;
+
+ case 4: /* overlap starts before lock */
+ /*
+ * Just update the overlap end and
+ * move on.
+ */
+ lf_set_end(state, overlap, lock->lf_start - 1,
+ &granted);
+ overlap = LIST_NEXT(overlap, lf_link);
+ continue;
+
+ case 5: /* overlap ends after lock */
+ /*
+ * Change the start of overlap and
+ * re-insert.
+ */
+ lf_set_start(state, overlap, lock->lf_end + 1,
+ &granted);
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ if (lock->lf_type != F_UNLCK)
+ lf_print("lf_activate_lock: activated", lock);
+ else
+ lf_print("lf_activate_lock: unlocked", lock);
+ lf_printlist("lf_activate_lock", lock);
+ }
+#endif /* LOCKF_DEBUG */
+ if (lock->lf_type != F_UNLCK)
+ lf_insert_lock(state, lock);
+ }
+}
+
+/*
+ * Cancel a pending lock request, either as a result of a signal or a
+ * cancel request for an async lock.
+ */
+static void
+lf_cancel_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry_list granted;
+
+ /*
+ * Note it is theoretically possible that cancelling this lock
+ * may allow some other pending lock to become
+ * active. Consider this case:
+ *
+ * Owner Action Result Dependencies
+ *
+ * A: lock [0..0] succeeds
+ * B: lock [2..2] succeeds
+ * C: lock [1..2] blocked C->B
+ * D: lock [0..1] blocked C->B,D->A,D->C
+ * A: unlock [0..0] C->B,D->C
+ * C: cancel [1..2]
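+ *
+ * When C cancels [1..2], removing the edge D->C leaves D with no
+ * out-going edges, so D's pending lock [0..1] can now be granted.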
+ */
+
+ LIST_REMOVE(lock, lf_link);
+
+ /*
+ * Removing out-going edges is simple.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ lf_remove_outgoing(lock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ /*
+ * Removing in-coming edges may allow some other lock to
+ * become active - we use lf_update_dependancies to figure
+ * this out.
+ */
+ LIST_INIT(&granted);
+ lf_update_dependancies(state, lock, TRUE, &granted);
+ lf_free_lock(lock);
+
+ /*
+ * Feed any newly active locks to lf_activate_lock.
+ */
+ while (!LIST_EMPTY(&granted)) {
+ lock = LIST_FIRST(&granted);
+ LIST_REMOVE(lock, lf_link);
+ lf_activate_lock(state, lock);
+ }
+}
+
+/*
+ * Set a byte-range lock.
+ */
+static int
+lf_setlock(struct lockf *state, struct lockf_entry *lock, struct vnode *vp,
+ void **cookiep)
+{
+ static char lockstr[] = "lockf";
+ int priority, error;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ /*
+ * Set the priority
+ */
+ priority = PLOCK;
+ if (lock->lf_type == F_WRLCK)
+ priority += 4;
+ if (!(lock->lf_flags & F_NOINTR))
+ priority |= PCATCH;
+ /*
+ * Scan lock list for this file looking for locks that would block us.
+ */
+ if (lf_getblock(state, lock)) {
+ /*
+ * Free the structure and return if nonblocking.
+ */
+ if ((lock->lf_flags & F_WAIT) == 0
+ && lock->lf_async_task == NULL) {
+ lf_free_lock(lock);
+ error = EAGAIN;
+ goto out;
+ }
+
+ /*
+ * For flock type locks, we must first remove
+ * any shared locks that we hold before we sleep
+ * waiting for an exclusive lock.
+ */
+ if ((lock->lf_flags & F_FLOCK) &&
+ lock->lf_type == F_WRLCK) {
+ lock->lf_type = F_UNLCK;
+ lf_activate_lock(state, lock);
+ lock->lf_type = F_WRLCK;
+ }
+
+ /*
+ * We are blocked. Create edges to each blocking lock,
+ * checking for deadlock using the owner graph. For
+ * simplicity, we run deadlock detection for all
+ * locks, posix and otherwise.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ error = lf_add_outgoing(state, lock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ if (error) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock: deadlock", lock);
+#endif
+ lf_free_lock(lock);
+ goto out;
+ }
+
+ /*
+ * We have added edges to everything that blocks
+ * us. Sleep until they all go away.
+ */
+ LIST_INSERT_HEAD(&state->ls_pending, lock, lf_link);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ struct lockf_edge *e;
+ LIST_FOREACH(e, &lock->lf_outedges, le_outlink) {
+ lf_print("lf_setlock: blocking on", e->le_to);
+ lf_printlist("lf_setlock", e->le_to);
+ }
+ }
+#endif /* LOCKF_DEBUG */
+
+ if ((lock->lf_flags & F_WAIT) == 0) {
+ /*
+ * The caller requested async notification -
+ * this callback happens when the blocking
+ * lock is released, allowing the caller to
+ * make another attempt to take the lock.
+ */
+ *cookiep = (void *) lock;
+ error = EINPROGRESS;
+ goto out;
+ }
+
+ lock->lf_refs++;
+ error = sx_sleep(lock, &state->ls_lock, priority, lockstr, 0);
+ if (lf_free_lock(lock)) {
+ error = EINTR;
+ goto out;
+ }
+
+ /*
+ * We may have been awakened by a signal and/or by a
+ * debugger continuing us (in which cases we must
+ * remove our lock graph edges) and/or by another
+ * process releasing a lock (in which case our edges
+ * have already been removed and we have been moved to
+ * the active list). We may also have been woken by
+ * lf_purgelocks which we report to the caller as
+ * EINTR. In that case, lf_purgelocks will have
+ * removed our lock graph edges.
+ *
+ * Note that it is possible to receive a signal after
+ * we were successfully woken (and moved to the active
+ * list) but before we resumed execution. In this
+ * case, our lf_outedges list will be clear. We
+ * pretend there was no error.
+ *
+ * Note also, if we have been sleeping long enough, we
+ * may now have incoming edges from some newer lock
+ * which is waiting behind us in the queue.
+ */
+ if (lock->lf_flags & F_INTR) {
+ error = EINTR;
+ lf_free_lock(lock);
+ goto out;
+ }
+ if (LIST_EMPTY(&lock->lf_outedges)) {
+ error = 0;
+ } else {
+ lf_cancel_lock(state, lock);
+ goto out;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: granted", lock);
+ }
+#endif
+ goto out;
+ }
+ /*
+ * It looks like we are going to grant the lock. First add
+ * edges from any currently pending lock that the new lock
+ * would block.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ error = lf_add_incoming(state, lock);
+ sx_xunlock(&lf_owner_graph_lock);
+ if (error) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock: deadlock", lock);
+#endif
+ lf_free_lock(lock);
+ goto out;
+ }
+
+ /*
+ * No blocks!! Add the lock. Note that we will
+ * downgrade or upgrade any overlapping locks this
+ * process already owns.
+ */
+ lf_activate_lock(state, lock);
+ error = 0;
+out:
+ return (error);
+}
+
+/*
+ * Remove a byte-range lock on an inode.
+ *
+ * Generally, find the lock (or an overlap to that lock)
+ * and remove it (or shrink it), then wakeup anyone we can.
+ */
+static int
+lf_clearlock(struct lockf *state, struct lockf_entry *unlock)
+{
+ struct lockf_entry *overlap;
+
+ overlap = LIST_FIRST(&state->ls_active);
+
+ if (overlap == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (unlock->lf_type != F_UNLCK)
+ panic("lf_clearlock: bad type");
+ if (lockf_debug & 1)
+ lf_print("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+
+ lf_activate_lock(state, unlock);
+
+ return (0);
+}
+
+/*
+ * Check whether there is a blocking lock, and if so return its
+ * details in '*fl'.
+ */
+static int
+lf_getlock(struct lockf *state, struct lockf_entry *lock, struct flock *fl)
+{
+ struct lockf_entry *block;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_getlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ if ((block = lf_getblock(state, lock))) {
+ fl->l_type = block->lf_type;
+ fl->l_whence = SEEK_SET;
+ fl->l_start = block->lf_start;
+ if (block->lf_end == OFF_MAX)
+ fl->l_len = 0;
+ else
+ fl->l_len = block->lf_end - block->lf_start + 1;
+ fl->l_pid = block->lf_owner->lo_pid;
+ fl->l_sysid = block->lf_owner->lo_sysid;
+ } else {
+ fl->l_type = F_UNLCK;
+ }
+ return (0);
+}
+
+/*
+ * Cancel an async lock request.
+ */
+static int
+lf_cancel(struct lockf *state, struct lockf_entry *lock, void *cookie)
+{
+ struct lockf_entry *reallock;
+
+ /*
+ * We need to match this request with an existing lock
+ * request.
+ */
+ LIST_FOREACH(reallock, &state->ls_pending, lf_link) {
+ if ((void *) reallock == cookie) {
+ /*
+ * Double-check that this lock looks right
+ * (maybe use a rolling ID for the cancel
+ * cookie instead?)
+ */
+ if (!(reallock->lf_vnode == lock->lf_vnode
+ && reallock->lf_start == lock->lf_start
+ && reallock->lf_end == lock->lf_end)) {
+ return (ENOENT);
+ }
+
+ /*
+ * Make sure this lock was async and then just
+ * remove it from its wait lists.
+ */
+ if (!reallock->lf_async_task) {
+ return (ENOENT);
+ }
+
+ /*
+ * Note that since any other thread must take
+ * state->ls_lock before it can possibly
+ * trigger the async callback, we are safe
+ * from a race with lf_wakeup_lock, i.e. we
+ * can free the lock (actually our caller does
+ * this).
+ */
+ lf_cancel_lock(state, reallock);
+ return (0);
+ }
+ }
+
+ /*
+ * We didn't find a matching lock - not much we can do here.
+ */
+ return (ENOENT);
+}
+
+/*
+ * Walk the list of locks for an inode and
+ * return the first blocking lock.
+ */
+static struct lockf_entry *
+lf_getblock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+
+ LIST_FOREACH(overlap, &state->ls_active, lf_link) {
+ /*
+ * We may assume that the active list is sorted by
+ * lf_start.
+ */
+ if (overlap->lf_start > lock->lf_end)
+ break;
+ if (!lf_blocks(lock, overlap))
+ continue;
+ return (overlap);
+ }
+ return (NOLOCKF);
+}
+
+/*
+ * Walk the list of locks for an inode to find an overlapping lock (if
+ * any) and return a classification of that overlap.
+ *
+ * Arguments:
+ * *overlap The place in the lock list to start looking
+ * lock The lock which is being tested
+ * type Pass 'SELF' to test only locks with the same
+ * owner as lock, or 'OTHERS' to test only locks
+ * with a different owner
+ *
+ * Returns one of six values:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ *
+ * If there is an overlapping lock, '*overlap' is set to point at the
+ * overlapping lock.
+ *
+ * NOTE: this returns only the FIRST overlapping lock. There
+ * may be more than one.
+ */
+static int
+lf_findoverlap(struct lockf_entry **overlap, struct lockf_entry *lock, int type)
+{
+ struct lockf_entry *lf;
+ off_t start, end;
+ int res;
+
+ if ((*overlap) == NOLOCKF) {
+ return (0);
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_findoverlap: looking for overlap in", lock);
+#endif /* LOCKF_DEBUG */
+ start = lock->lf_start;
+ end = lock->lf_end;
+ res = 0;
+ while (*overlap) {
+ lf = *overlap;
+ if (lf->lf_start > end)
+ break;
+ if (((type & SELF) && lf->lf_owner != lock->lf_owner) ||
+ ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) {
+ *overlap = LIST_NEXT(lf, lf_link);
+ continue;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("\tchecking", lf);
+#endif /* LOCKF_DEBUG */
+ /*
+ * OK, check for overlap
+ *
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ if (start > lf->lf_end) {
+ /* Case 0 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("no overlap\n");
+#endif /* LOCKF_DEBUG */
+ *overlap = LIST_NEXT(lf, lf_link);
+ continue;
+ }
+ if (lf->lf_start == start && lf->lf_end == end) {
+ /* Case 1 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap == lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 1;
+ break;
+ }
+ if (lf->lf_start <= start && lf->lf_end >= end) {
+ /* Case 2 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap contains lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 2;
+ break;
+ }
+ if (start <= lf->lf_start && end >= lf->lf_end) {
+ /* Case 3 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("lock contains overlap\n");
+#endif /* LOCKF_DEBUG */
+ res = 3;
+ break;
+ }
+ if (lf->lf_start < start && lf->lf_end >= start) {
+ /* Case 4 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap starts before lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 4;
+ break;
+ }
+ if (lf->lf_start > start && lf->lf_end > end) {
+ /* Case 5 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap ends after lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 5;
+ break;
+ }
+ panic("lf_findoverlap: default");
+ }
+ return (res);
+}
+
+/*
+ * Split the existing 'lock1', based on the extent of the lock
+ * described by 'lock2'. The existing lock should cover 'lock2'
+ * entirely.
+ *
+ * Any pending locks which have been unblocked are added to
+ * 'granted'.
+ */
+static void
+lf_split(struct lockf *state, struct lockf_entry *lock1,
+ struct lockf_entry *lock2, struct lockf_entry_list *granted)
+{
+ struct lockf_entry *splitlock;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2) {
+ lf_print("lf_split", lock1);
+ lf_print("splitting from", lock2);
+ }
+#endif /* LOCKF_DEBUG */
+ /*
+ * Check whether we need to split at all.
+ */
+ if (lock1->lf_start == lock2->lf_start) {
+ lf_set_start(state, lock1, lock2->lf_end + 1, granted);
+ return;
+ }
+ if (lock1->lf_end == lock2->lf_end) {
+ lf_set_end(state, lock1, lock2->lf_start - 1, granted);
+ return;
+ }
+ /*
+ * Make a new lock consisting of the last part of
+ * the encompassing lock.
+ */
+ splitlock = lf_alloc_lock(lock1->lf_owner);
+ memcpy(splitlock, lock1, sizeof *splitlock);
+ splitlock->lf_refs = 1;
+ if (splitlock->lf_flags & F_REMOTE)
+ vref(splitlock->lf_vnode);
+
+ /*
+ * This cannot cause a deadlock since any edges we would add
+ * to splitlock already exist in lock1. We must be sure to add
+ * necessary dependencies to splitlock before we reduce lock1
+ * otherwise we may accidentally grant a pending lock that
+ * was blocked by the tail end of lock1.
+ */
+ splitlock->lf_start = lock2->lf_end + 1;
+ LIST_INIT(&splitlock->lf_outedges);
+ LIST_INIT(&splitlock->lf_inedges);
+ sx_xlock(&lf_owner_graph_lock);
+ lf_add_incoming(state, splitlock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ lf_set_end(state, lock1, lock2->lf_start - 1, granted);
+
+ /*
+ * OK, now link it in
+ */
+ lf_insert_lock(state, splitlock);
+}
+
+struct lockdesc {
+ STAILQ_ENTRY(lockdesc) link;
+ struct vnode *vp;
+ struct flock fl;
+};
+STAILQ_HEAD(lockdesclist, lockdesc);
+
+int
+lf_iteratelocks_sysid(int sysid, lf_iterator *fn, void *arg)
+{
+ struct lockf *ls;
+ struct lockf_entry *lf;
+ struct lockdesc *ldesc;
+ struct lockdesclist locks;
+ int error;
+
+ /*
+ * In order to keep the locking simple, we iterate over the
+ * active lock lists to build a list of locks that need
+ * releasing. We then call the iterator for each one in turn.
+ *
+ * We take an extra reference to the vnode for the duration to
+ * make sure it doesn't go away before we are finished.
+ */
+ STAILQ_INIT(&locks);
+ sx_xlock(&lf_lock_states_lock);
+ LIST_FOREACH(ls, &lf_lock_states, ls_link) {
+ sx_xlock(&ls->ls_lock);
+ LIST_FOREACH(lf, &ls->ls_active, lf_link) {
+ if (lf->lf_owner->lo_sysid != sysid)
+ continue;
+
+ ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
+ M_WAITOK);
+ ldesc->vp = lf->lf_vnode;
+ vref(ldesc->vp);
+ ldesc->fl.l_start = lf->lf_start;
+ if (lf->lf_end == OFF_MAX)
+ ldesc->fl.l_len = 0;
+ else
+ ldesc->fl.l_len =
+ lf->lf_end - lf->lf_start + 1;
+ ldesc->fl.l_whence = SEEK_SET;
+ ldesc->fl.l_type = F_UNLCK;
+ ldesc->fl.l_pid = lf->lf_owner->lo_pid;
+ ldesc->fl.l_sysid = sysid;
+ STAILQ_INSERT_TAIL(&locks, ldesc, link);
+ }
+ sx_xunlock(&ls->ls_lock);
+ }
+ sx_xunlock(&lf_lock_states_lock);
+
+ /*
+ * Call the iterator function for each lock in turn. If the
+ * iterator returns an error code, just free the rest of the
+ * lockdesc structures.
+ */
+ error = 0;
+ while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
+ STAILQ_REMOVE_HEAD(&locks, link);
+ if (!error)
+ error = fn(ldesc->vp, &ldesc->fl, arg);
+ vrele(ldesc->vp);
+ free(ldesc, M_LOCKF);
+ }
+
+ return (error);
+}
+
+int
+lf_iteratelocks_vnode(struct vnode *vp, lf_iterator *fn, void *arg)
+{
+ struct lockf *ls;
+ struct lockf_entry *lf;
+ struct lockdesc *ldesc;
+ struct lockdesclist locks;
+ int error;
+
+ /*
+ * In order to keep the locking simple, we iterate over the
+ * active lock lists to build a list of locks that need
+ * releasing. We then call the iterator for each one in turn.
+ *
+ * We take an extra reference to the vnode for the duration to
+ * make sure it doesn't go away before we are finished.
+ */
+ STAILQ_INIT(&locks);
+ VI_LOCK(vp);
+ ls = vp->v_lockf;
+ if (!ls) {
+ VI_UNLOCK(vp);
+ return (0);
+ }
+ ls->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&ls->ls_lock);
+ LIST_FOREACH(lf, &ls->ls_active, lf_link) {
+ ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
+ M_WAITOK);
+ ldesc->vp = lf->lf_vnode;
+ vref(ldesc->vp);
+ ldesc->fl.l_start = lf->lf_start;
+ if (lf->lf_end == OFF_MAX)
+ ldesc->fl.l_len = 0;
+ else
+ ldesc->fl.l_len =
+ lf->lf_end - lf->lf_start + 1;
+ ldesc->fl.l_whence = SEEK_SET;
+ ldesc->fl.l_type = F_UNLCK;
+ ldesc->fl.l_pid = lf->lf_owner->lo_pid;
+ ldesc->fl.l_sysid = lf->lf_owner->lo_sysid;
+ STAILQ_INSERT_TAIL(&locks, ldesc, link);
+ }
+ sx_xunlock(&ls->ls_lock);
+ VI_LOCK(vp);
+ ls->ls_threads--;
+ wakeup(ls);
+ VI_UNLOCK(vp);
+
+ /*
+ * Call the iterator function for each lock in turn. If the
+ * iterator returns an error code, just free the rest of the
+ * lockdesc structures.
+ */
+ error = 0;
+ while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
+ STAILQ_REMOVE_HEAD(&locks, link);
+ if (!error)
+ error = fn(ldesc->vp, &ldesc->fl, arg);
+ vrele(ldesc->vp);
+ free(ldesc, M_LOCKF);
+ }
+
+ return (error);
+}
+
+static int
+lf_clearremotesys_iterator(struct vnode *vp, struct flock *fl, void *arg)
+{
+
+ VOP_ADVLOCK(vp, 0, F_UNLCK, fl, F_REMOTE);
+ return (0);
+}
+
+void
+lf_clearremotesys(int sysid)
+{
+
+ KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS"));
+ lf_iteratelocks_sysid(sysid, lf_clearremotesys_iterator, NULL);
+}
+
+int
+lf_countlocks(int sysid)
+{
+ int i;
+ struct lock_owner *lo;
+ int count;
+
+ count = 0;
+ sx_xlock(&lf_lock_owners_lock);
+ for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
+ LIST_FOREACH(lo, &lf_lock_owners[i], lo_link)
+ if (lo->lo_sysid == sysid)
+ count += lo->lo_refs;
+ sx_xunlock(&lf_lock_owners_lock);
+
+ return (count);
+}
+
+#ifdef LOCKF_DEBUG
+
+/*
+ * Return non-zero if y is reachable from x using a brute force
+ * search. If reachable and path is non-null, return the route taken
+ * in path.
+ */
+static int
+graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *path)
+{
+ struct owner_edge *e;
+
+ if (x == y) {
+ if (path)
+ TAILQ_INSERT_HEAD(path, x, v_link);
+ return 1;
+ }
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (graph_reaches(e->e_to, y, path)) {
+ if (path)
+ TAILQ_INSERT_HEAD(path, x, v_link);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Perform consistency checks on the graph. Make sure the values of
+ * v_order are correct. If checkorder is non-zero, check no vertex can
+ * reach any other vertex with a smaller order.
+ */
+static void
+graph_check(struct owner_graph *g, int checkorder)
+{
+ int i, j;
+
+ for (i = 0; i < g->g_size; i++) {
+ if (!g->g_vertices[i]->v_owner)
+ continue;
+ KASSERT(g->g_vertices[i]->v_order == i,
+ ("lock graph vertices disordered"));
+ if (checkorder) {
+ for (j = 0; j < i; j++) {
+ if (!g->g_vertices[j]->v_owner)
+ continue;
+ KASSERT(!graph_reaches(g->g_vertices[i],
+ g->g_vertices[j], NULL),
+ ("lock graph vertices disordered"));
+ }
+ }
+ }
+}
+
+static void
+graph_print_vertices(struct owner_vertex_list *set)
+{
+ struct owner_vertex *v;
+
+ printf("{ ");
+ TAILQ_FOREACH(v, set, v_link) {
+ printf("%d:", v->v_order);
+ lf_print_owner(v->v_owner);
+ if (TAILQ_NEXT(v, v_link))
+ printf(", ");
+ }
+ printf(" }\n");
+}
+
+#endif
+
+/*
+ * Calculate the sub-set of vertices v from the affected region [y..x]
+ * where v is reachable from y. Return -1 if a loop was detected
+ * (i.e. x is reachable from y); otherwise return the number of
+ * vertices in this subset.
+ */
+static int
+graph_delta_forward(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y, struct owner_vertex_list *delta)
+{
+ uint32_t gen;
+ struct owner_vertex *v;
+ struct owner_edge *e;
+ int n;
+
+ /*
+ * We start with a set containing just y. Then for each vertex
+ * v in the set so far unprocessed, we add each vertex that v
+ * has an out-edge to and that is within the affected region
+ * [y..x]. If we see the vertex x on our travels, stop
+ * immediately.
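+ *
+ * The 'delta' list doubles as the search queue: 'v' walks forward
+ * over entries appended during the scan, so the loop ends once every
+ * reachable vertex in the region has been visited.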
+ */
+ TAILQ_INIT(delta);
+ TAILQ_INSERT_TAIL(delta, y, v_link);
+ v = y;
+ n = 1;
+ gen = g->g_gen;
+ while (v) {
+ LIST_FOREACH(e, &v->v_outedges, e_outlink) {
+ if (e->e_to == x)
+ return -1;
+ if (e->e_to->v_order < x->v_order
+ && e->e_to->v_gen != gen) {
+ e->e_to->v_gen = gen;
+ TAILQ_INSERT_TAIL(delta, e->e_to, v_link);
+ n++;
+ }
+ }
+ v = TAILQ_NEXT(v, v_link);
+ }
+
+ return (n);
+}
+
+/*
+ * Calculate the sub-set of vertices v from the affected region [y..x]
+ * where v reaches x. Return the number of vertices in this subset.
+ */
+static int
+graph_delta_backward(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y, struct owner_vertex_list *delta)
+{
+ uint32_t gen;
+ struct owner_vertex *v;
+ struct owner_edge *e;
+ int n;
+
+ /*
+ * We start with a set containing just x. Then for each vertex
+ * v in the set so far unprocessed, we add each vertex that v
+ * has an in-edge from and that is within the affected region
+ * [y..x].
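+ *
+ * Here new vertices are prepended at the head of 'delta' while 'v'
+ * walks backwards from the tail with TAILQ_PREV, so every prepended
+ * entry is still visited.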
+ */
+ TAILQ_INIT(delta);
+ TAILQ_INSERT_TAIL(delta, x, v_link);
+ v = x;
+ n = 1;
+ gen = g->g_gen;
+ while (v) {
+ LIST_FOREACH(e, &v->v_inedges, e_inlink) {
+ if (e->e_from->v_order > y->v_order
+ && e->e_from->v_gen != gen) {
+ e->e_from->v_gen = gen;
+ TAILQ_INSERT_HEAD(delta, e->e_from, v_link);
+ n++;
+ }
+ }
+ v = TAILQ_PREV(v, owner_vertex_list, v_link);
+ }
+
+ return (n);
+}
+
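+/*
+ * Merge the vertex order values of 'set' into the sorted array
+ * 'indices', which already holds 'n' entries, using an insertion
+ * sort. Return the new number of entries.
+ */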
+static int
+graph_add_indices(int *indices, int n, struct owner_vertex_list *set)
+{
+ struct owner_vertex *v;
+ int i, j;
+
+ TAILQ_FOREACH(v, set, v_link) {
+ for (i = n;
+ i > 0 && indices[i - 1] > v->v_order; i--)
+ ;
+ for (j = n - 1; j >= i; j--)
+ indices[j + 1] = indices[j];
+ indices[i] = v->v_order;
+ n++;
+ }
+
+ return (n);
+}
+
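+/*
+ * Assign new order values to the vertices in 'set', taking them from
+ * 'indices' starting at 'nextunused'. The lowest-ordered vertex is
+ * pulled from the set each time so that the set's relative ordering
+ * is preserved. Return the next unused index.
+ */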
+static int
+graph_assign_indices(struct owner_graph *g, int *indices, int nextunused,
+ struct owner_vertex_list *set)
+{
+ struct owner_vertex *v, *vlowest;
+
+ while (!TAILQ_EMPTY(set)) {
+ vlowest = NULL;
+ TAILQ_FOREACH(v, set, v_link) {
+ if (!vlowest || v->v_order < vlowest->v_order)
+ vlowest = v;
+ }
+ TAILQ_REMOVE(set, vlowest, v_link);
+ vlowest->v_order = indices[nextunused];
+ g->g_vertices[vlowest->v_order] = vlowest;
+ nextunused++;
+ }
+
+ return (nextunused);
+}
+
+static int
+graph_add_edge(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y)
+{
+ struct owner_edge *e;
+ struct owner_vertex_list deltaF, deltaB;
+ int nF, nB, n, vi, i;
+ int *indices;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (e->e_to == y) {
+ e->e_refs++;
+ return (0);
+ }
+ }
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("adding edge %d:", x->v_order);
+ lf_print_owner(x->v_owner);
+ printf(" -> %d:", y->v_order);
+ lf_print_owner(y->v_owner);
+ printf("\n");
+ }
+#endif
+ if (y->v_order < x->v_order) {
+ /*
+ * The new edge violates the order. First find the set
+ * of affected vertices reachable from y (deltaF) and
+ * the set of affected vertices that reach x
+ * (deltaB), using the graph generation number to
+ * detect whether we have visited a given vertex
+ * already. We re-order the graph so that each vertex
+ * in deltaB appears before each vertex in deltaF.
+ *
+ * If x is a member of deltaF, then the new edge would
+ * create a cycle. Otherwise, we may assume that
+ * deltaF and deltaB are disjoint.
+ */
+ g->g_gen++;
+ if (g->g_gen == 0) {
+ /*
+ * Generation wrap.
+ */
+ for (vi = 0; vi < g->g_size; vi++) {
+ g->g_vertices[vi]->v_gen = 0;
+ }
+ g->g_gen++;
+ }
+ nF = graph_delta_forward(g, x, y, &deltaF);
+ if (nF < 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ struct owner_vertex_list path;
+ printf("deadlock: ");
+ TAILQ_INIT(&path);
+ graph_reaches(y, x, &path);
+ graph_print_vertices(&path);
+ }
+#endif
+ return (EDEADLK);
+ }
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("re-ordering graph vertices\n");
+ printf("deltaF = ");
+ graph_print_vertices(&deltaF);
+ }
+#endif
+
+ nB = graph_delta_backward(g, x, y, &deltaB);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("deltaB = ");
+ graph_print_vertices(&deltaB);
+ }
+#endif
+
+ /*
+ * We first build a set of vertex indices (vertex
+ * order values) that we may use, then we re-assign
+ * orders first to those vertices in deltaB, then to
+ * deltaF. Note that the contents of deltaF and deltaB
+ * may be partially disordered - we perform an
+ * insertion sort while building our index set.
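+ *
+ * Only the vertices in deltaF and deltaB are re-numbered; every
+ * other vertex keeps its existing order value.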
+ */
+ indices = g->g_indexbuf;
+ n = graph_add_indices(indices, 0, &deltaF);
+ graph_add_indices(indices, n, &deltaB);
+
+ /*
+ * We must also be sure to maintain the relative
+ * ordering of deltaF and deltaB when re-assigning
+ * vertices. We do this by iteratively removing the
+ * lowest ordered element from the set and assigning
+ * it the next value from our new ordering.
+ */
+ i = graph_assign_indices(g, indices, 0, &deltaB);
+ graph_assign_indices(g, indices, i, &deltaF);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ struct owner_vertex_list set;
+ TAILQ_INIT(&set);
+ for (i = 0; i < nB + nF; i++)
+ TAILQ_INSERT_TAIL(&set,
+ g->g_vertices[indices[i]], v_link);
+ printf("new ordering = ");
+ graph_print_vertices(&set);
+ }
+#endif
+ }
+
+ KASSERT(x->v_order < y->v_order, ("Failed to re-order graph"));
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ graph_check(g, TRUE);
+ }
+#endif
+
+ e = malloc(sizeof(struct owner_edge), M_LOCKF, M_WAITOK);
+
+ LIST_INSERT_HEAD(&x->v_outedges, e, e_outlink);
+ LIST_INSERT_HEAD(&y->v_inedges, e, e_inlink);
+ e->e_refs = 1;
+ e->e_from = x;
+ e->e_to = y;
+
+ return (0);
+}
+
+/*
+ * Remove an edge x->y from the graph.
+ */
+static void
+graph_remove_edge(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y)
+{
+ struct owner_edge *e;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (e->e_to == y)
+ break;
+ }
+ KASSERT(e, ("Removing non-existent edge from deadlock graph"));
+
+ e->e_refs--;
+ if (e->e_refs == 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("removing edge %d:", x->v_order);
+ lf_print_owner(x->v_owner);
+ printf(" -> %d:", y->v_order);
+ lf_print_owner(y->v_owner);
+ printf("\n");
+ }
+#endif
+ LIST_REMOVE(e, e_outlink);
+ LIST_REMOVE(e, e_inlink);
+ free(e, M_LOCKF);
+ }
+}
+
+/*
+ * Allocate a vertex for a new lock owner, growing the graph's vertex
+ * array and index buffer if necessary.
+ */
+static struct owner_vertex *
+graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo)
+{
+ struct owner_vertex *v;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ v = malloc(sizeof(struct owner_vertex), M_LOCKF, M_WAITOK);
+ if (g->g_size == g->g_space) {
+ g->g_vertices = realloc(g->g_vertices,
+ 2 * g->g_space * sizeof(struct owner_vertex *),
+ M_LOCKF, M_WAITOK);
+ free(g->g_indexbuf, M_LOCKF);
+ g->g_indexbuf = malloc(2 * g->g_space * sizeof(int),
+ M_LOCKF, M_WAITOK);
+ g->g_space = 2 * g->g_space;
+ }
+ v->v_order = g->g_size;
+ v->v_gen = g->g_gen;
+ g->g_vertices[g->g_size] = v;
+ g->g_size++;
+
+ LIST_INIT(&v->v_outedges);
+ LIST_INIT(&v->v_inedges);
+ v->v_owner = lo;
+
+ return (v);
+}
+
+static void
+graph_free_vertex(struct owner_graph *g, struct owner_vertex *v)
+{
+ struct owner_vertex *w;
+ int i;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ KASSERT(LIST_EMPTY(&v->v_outedges), ("Freeing vertex with edges"));
+ KASSERT(LIST_EMPTY(&v->v_inedges), ("Freeing vertex with edges"));
+
+ /*
+ * Remove from the graph's array and close up the gap,
+ * renumbering the other vertices.
+ */
+ for (i = v->v_order + 1; i < g->g_size; i++) {
+ w = g->g_vertices[i];
+ w->v_order--;
+ g->g_vertices[i - 1] = w;
+ }
+ g->g_size--;
+
+ free(v, M_LOCKF);
+}
+
+static struct owner_graph *
+graph_init(struct owner_graph *g)
+{
+
+ g->g_vertices = malloc(10 * sizeof(struct owner_vertex *),
+ M_LOCKF, M_WAITOK);
+ g->g_size = 0;
+ g->g_space = 10;
+ g->g_indexbuf = malloc(g->g_space * sizeof(int), M_LOCKF, M_WAITOK);
+ g->g_gen = 0;
+
+ return (g);
+}
+
+#ifdef LOCKF_DEBUG
+/*
+ * Print description of a lock owner
+ */
+static void
+lf_print_owner(struct lock_owner *lo)
+{
+
+ if (lo->lo_flags & F_REMOTE) {
+ printf("remote pid %d, system %d",
+ lo->lo_pid, lo->lo_sysid);
+ } else if (lo->lo_flags & F_FLOCK) {
+ printf("file %p", lo->lo_id);
+ } else {
+ printf("local pid %d", lo->lo_pid);
+ }
+}
+
+/*
+ * Print out a lock.
+ */
+static void
+lf_print(char *tag, struct lockf_entry *lock)
+{
+
+ printf("%s: lock %p for ", tag, (void *)lock);
+ lf_print_owner(lock->lf_owner);
+ if (lock->lf_inode != (struct inode *)0)
+ printf(" in ino %ju on dev <%s>,",
+ (uintmax_t)lock->lf_inode->i_number,
+ devtoname(lock->lf_inode->i_dev));
+ printf(" %s, start %jd, end ",
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" : "unknown",
+ (intmax_t)lock->lf_start);
+ if (lock->lf_end == OFF_MAX)
+ printf("EOF");
+ else
+ printf("%jd", (intmax_t)lock->lf_end);
+ if (!LIST_EMPTY(&lock->lf_outedges))
+ printf(" block %p\n",
+ (void *)LIST_FIRST(&lock->lf_outedges)->le_to);
+ else
+ printf("\n");
+}
+
+static void
+lf_printlist(char *tag, struct lockf_entry *lock)
+{
+ struct lockf_entry *lf, *blk;
+ struct lockf_edge *e;
+
+ if (lock->lf_inode == (struct inode *)0)
+ return;
+
+ printf("%s: Lock list for ino %ju on dev <%s>:\n",
+ tag, (uintmax_t)lock->lf_inode->i_number,
+ devtoname(lock->lf_inode->i_dev));
+ LIST_FOREACH(lf, &lock->lf_vnode->v_lockf->ls_active, lf_link) {
+ printf("\tlock %p for ",(void *)lf);
+ lf_print_owner(lock->lf_owner);
+ printf(", %s, start %jd, end %jd",
+ lf->lf_type == F_RDLCK ? "shared" :
+ lf->lf_type == F_WRLCK ? "exclusive" :
+ lf->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
+ LIST_FOREACH(e, &lf->lf_outedges, le_outlink) {
+ blk = e->le_to;
+ printf("\n\t\tlock request %p for ", (void *)blk);
+ lf_print_owner(blk->lf_owner);
+ printf(", %s, start %jd, end %jd",
+ blk->lf_type == F_RDLCK ? "shared" :
+ blk->lf_type == F_WRLCK ? "exclusive" :
+ blk->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (intmax_t)blk->lf_start,
+ (intmax_t)blk->lf_end);
+ if (!LIST_EMPTY(&blk->lf_inedges))
+ panic("lf_printlist: bad list");
+ }
+ printf("\n");
+ }
+}
+#endif /* LOCKF_DEBUG */
diff --git a/sys/kern/kern_lockstat.c b/sys/kern/kern_lockstat.c
new file mode 100644
index 0000000..1f35893
--- /dev/null
+++ b/sys/kern/kern_lockstat.c
@@ -0,0 +1,64 @@
+/*-
+ * Copyright 2008-2009 Stacey Son <sson@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Backend for the lock tracing (lockstat) kernel support. This is required
+ * to allow a module to load even though DTrace kernel support may not be
+ * present.
+ *
+ */
+
+#include "opt_kdtrace.h"
+
+#ifdef KDTRACE_HOOKS
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/lockstat.h>
+
+/*
+ * The following must match the type definition of dtrace_probe. It is
+ * defined this way to avoid having to rely on CDDL code.
+ */
+uint32_t lockstat_probemap[LS_NPROBES];
+void (*lockstat_probe_func)(uint32_t, uintptr_t, uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+
+
+uint64_t
+lockstat_nsecs(void)
+{
+ struct bintime bt;
+ uint64_t ns;
+
+ binuptime(&bt);
+ ns = bt.sec * (uint64_t)1000000000;
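+ /*
+ * bt.frac is a 64-bit binary fraction of a second. Using only its
+ * top 32 bits keeps the multiplication by 10^9 from overflowing:
+ * ((frac >> 32) * 10^9) >> 32 is approximately frac * 10^9 / 2^64.
+ */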
+ ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (ns);
+}
+
+#endif /* KDTRACE_HOOKS */
diff --git a/sys/kern/kern_loginclass.c b/sys/kern/kern_loginclass.c
new file mode 100644
index 0000000..beac93b
--- /dev/null
+++ b/sys/kern/kern_loginclass.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Processes may set their login class name using setloginclass(2).
+ * This is usually done through a call to setusercontext(3), by programs
+ * such as login(1), based on information from master.passwd(5). The
+ * kernel uses this information to enforce per-class resource limits.
+ * The current login class can be determined using id(1). The login
+ * class is inherited from the parent process during fork(2). If not
+ * set, it defaults to "default".
+ *
+ * The code in this file implements the setloginclass(2) and
+ * getloginclass(2) system calls and maintains class name storage and
+ * retrieval.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/racct.h>
+#include <sys/refcount.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+
+static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures");
+
+LIST_HEAD(, loginclass) loginclasses;
+
+/*
+ * Lock protecting loginclasses list.
+ */
+static struct mtx loginclasses_lock;
+
+static void lc_init(void);
+SYSINIT(loginclass, SI_SUB_CPU, SI_ORDER_FIRST, lc_init, NULL);
+
+void
+loginclass_hold(struct loginclass *lc)
+{
+
+ refcount_acquire(&lc->lc_refcount);
+}
+
+void
+loginclass_free(struct loginclass *lc)
+{
+ int old;
+
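+ /*
+ * Fast path: if this is not the last reference, drop it using an
+ * atomic compare-and-swap without taking the list lock.
+ */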
+ old = lc->lc_refcount;
+ if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1))
+ return;
+
+ mtx_lock(&loginclasses_lock);
+ if (refcount_release(&lc->lc_refcount)) {
+ racct_destroy(&lc->lc_racct);
+ LIST_REMOVE(lc, lc_next);
+ mtx_unlock(&loginclasses_lock);
+ free(lc, M_LOGINCLASS);
+
+ return;
+ }
+ mtx_unlock(&loginclasses_lock);
+}
+
+/*
+ * Return the loginclass structure with the given name. Not
+ * performance critical, as it's used mainly by setloginclass(2),
+ * which happens once per login session. Caller has to use
+ * loginclass_free() on the returned value when it's no longer
+ * needed.
+ */
+struct loginclass *
+loginclass_find(const char *name)
+{
+ struct loginclass *lc, *newlc;
+
+ if (name[0] == '\0' || strlen(name) >= MAXLOGNAME)
+ return (NULL);
+
+ newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK);
+ racct_create(&newlc->lc_racct);
+
+ mtx_lock(&loginclasses_lock);
+ LIST_FOREACH(lc, &loginclasses, lc_next) {
+ if (strcmp(name, lc->lc_name) != 0)
+ continue;
+
+ /* Found loginclass with a matching name? */
+ loginclass_hold(lc);
+ mtx_unlock(&loginclasses_lock);
+ racct_destroy(&newlc->lc_racct);
+ free(newlc, M_LOGINCLASS);
+ return (lc);
+ }
+
+ /* Add new loginclass. */
+ strcpy(newlc->lc_name, name);
+ refcount_init(&newlc->lc_refcount, 1);
+ LIST_INSERT_HEAD(&loginclasses, newlc, lc_next);
+ mtx_unlock(&loginclasses_lock);
+
+ return (newlc);
+}
+
+/*
+ * Get login class name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getloginclass_args {
+ char *namebuf;
+ size_t namelen;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getloginclass(struct thread *td, struct getloginclass_args *uap)
+{
+ int error = 0;
+ size_t lcnamelen;
+ struct proc *p;
+ struct loginclass *lc;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lc = p->p_ucred->cr_loginclass;
+ loginclass_hold(lc);
+ PROC_UNLOCK(p);
+
+ lcnamelen = strlen(lc->lc_name) + 1;
+ if (lcnamelen > uap->namelen)
+ error = ERANGE;
+ if (error == 0)
+ error = copyout(lc->lc_name, uap->namebuf, lcnamelen);
+ loginclass_free(lc);
+ return (error);
+}
+
+/*
+ * Set login class name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setloginclass_args {
+ const char *namebuf;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setloginclass(struct thread *td, struct setloginclass_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int error;
+ char lcname[MAXLOGNAME];
+ struct loginclass *newlc;
+ struct ucred *newcred, *oldcred;
+
+ error = priv_check(td, PRIV_PROC_SETLOGINCLASS);
+ if (error != 0)
+ return (error);
+ error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL);
+ if (error != 0)
+ return (error);
+
+ newlc = loginclass_find(lcname);
+ if (newlc == NULL)
+ return (EINVAL);
+ newcred = crget();
+
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+ newcred->cr_loginclass = newlc;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ loginclass_free(oldcred->cr_loginclass);
+ crfree(oldcred);
+
+ return (0);
+}
+
+void
+loginclass_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct loginclass *lc;
+
+ mtx_lock(&loginclasses_lock);
+ LIST_FOREACH(lc, &loginclasses, lc_next)
+ (callback)(lc->lc_racct, arg2, arg3);
+ mtx_unlock(&loginclasses_lock);
+}
+
+static void
+lc_init(void)
+{
+
+ mtx_init(&loginclasses_lock, "loginclasses lock", NULL, MTX_DEF);
+}
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
new file mode 100644
index 0000000..9116433
--- /dev/null
+++ b/sys/kern/kern_malloc.c
@@ -0,0 +1,1100 @@
+/*-
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
+ */
+
+/*
+ * Kernel malloc(9) implementation -- general purpose kernel memory allocator
+ * based on memory types. Back end is implemented using the UMA(9) zone
+ * allocator. A set of fixed-size buckets are used for smaller allocations,
+ * and a special UMA allocation interface is used for larger allocations.
+ * Callers declare memory types, and statistics are maintained independently
+ * for each memory type. Statistics are maintained per-CPU for performance
+ * reasons. See malloc(9) and comments in malloc.h for a detailed
+ * description.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/vmem.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <vm/uma_dbg.h>
+
+#ifdef DEBUG_MEMGUARD
+#include <vm/memguard.h>
+#endif
+#ifdef DEBUG_REDZONE
+#include <vm/redzone.h>
+#endif
+
+#if defined(INVARIANTS) && defined(__i386__)
+#include <machine/cpu.h>
+#endif
+
+#include <ddb/ddb.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_malloc_probe_func_t dtrace_malloc_probe;
+#endif
+
+/*
+ * When realloc() is called, if the new size is sufficiently smaller than
+ * the old size, realloc() will allocate a new, smaller block to avoid
+ * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
+ * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
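+ *
+ * For example, with the default value of 1, shrinking a 2048-byte
+ * allocation to 1500 bytes reuses the existing block, while shrinking
+ * it to 512 bytes allocates a new, smaller one.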
+ */
+#ifndef REALLOC_FRACTION
+#define REALLOC_FRACTION 1 /* new block if <= half the size */
+#endif
+
+/*
+ * Centrally define some common malloc types.
+ */
+MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
+MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
+MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
+
+MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
+MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
+
+static struct malloc_type *kmemstatistics;
+static int kmemcount;
+
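+/*
+ * Requests of up to KMEM_ZMAX bytes are rounded up to the next multiple
+ * of KMEM_ZBASE; kmemsize[] then maps the rounded size, divided by
+ * KMEM_ZBASE, to the index of the kmemzones[] bucket used to satisfy it.
+ */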
+#define KMEM_ZSHIFT 4
+#define KMEM_ZBASE 16
+#define KMEM_ZMASK (KMEM_ZBASE - 1)
+
+#define KMEM_ZMAX PAGE_SIZE
+#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT)
+static uint8_t kmemsize[KMEM_ZSIZE + 1];
+
+#ifndef MALLOC_DEBUG_MAXZONES
+#define MALLOC_DEBUG_MAXZONES 1
+#endif
+static int numzones = MALLOC_DEBUG_MAXZONES;
+
+/*
+ * Small malloc(9) memory allocations are allocated from a set of UMA buckets
+ * of various sizes.
+ *
+ * XXX: The comment here used to read "These won't be powers of two for
+ * long." It's possible that a significant amount of wasted memory could be
+ * recovered by tuning the sizes of these buckets.
+ */
+struct {
+ int kz_size;
+ char *kz_name;
+ uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES];
+} kmemzones[] = {
+ {16, "16", },
+ {32, "32", },
+ {64, "64", },
+ {128, "128", },
+ {256, "256", },
+ {512, "512", },
+ {1024, "1024", },
+ {2048, "2048", },
+ {4096, "4096", },
+#if PAGE_SIZE > 4096
+ {8192, "8192", },
+#if PAGE_SIZE > 8192
+ {16384, "16384", },
+#if PAGE_SIZE > 16384
+ {32768, "32768", },
+#if PAGE_SIZE > 32768
+ {65536, "65536", },
+#if PAGE_SIZE > 65536
+#error "Unsupported PAGE_SIZE"
+#endif /* 65536 */
+#endif /* 32768 */
+#endif /* 16384 */
+#endif /* 8192 */
+#endif /* 4096 */
+ {0, NULL},
+};
+
+/*
+ * Zone to allocate malloc type descriptions from. For ABI reasons, memory
+ * types are described by a data structure passed by the declaring code, but
+ * the malloc(9) implementation has its own data structure describing the
+ * type and statistics. This permits the malloc(9)-internal data structures
+ * to be modified without breaking binary-compiled kernel modules that
+ * declare malloc types.
+ */
+static uma_zone_t mt_zone;
+
+u_long vm_kmem_size;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0,
+ "Size of kernel memory");
+
+static u_long vm_kmem_size_min;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0,
+ "Minimum size of kernel memory");
+
+static u_long vm_kmem_size_max;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0,
+ "Maximum size of kernel memory");
+
+static u_int vm_kmem_size_scale;
+SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0,
+ "Scale factor for kernel memory size");
+
+static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size,
+ CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_kmem_map_size, "LU", "Current kmem allocation size");
+
+static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free,
+ CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_kmem_map_free, "LU", "Free space in kmem");
+
+/*
+ * The malloc_mtx protects the kmemstatistics linked list.
+ */
+struct mtx malloc_mtx;
+
+#ifdef MALLOC_PROFILE
+uint64_t krequests[KMEM_ZSIZE + 1];
+
+static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
+#endif
+
+static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
+
+/*
+ * time_uptime of the last malloc(9) failure (induced or real).
+ */
+static time_t t_malloc_fail;
+
+#if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1)
+static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
+ "Kernel malloc debugging options");
+#endif
+
+/*
+ * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
+ * the caller specifies M_NOWAIT. If set to 0, no failures are caused.
+ */
+#ifdef MALLOC_MAKE_FAILURES
+static int malloc_failure_rate;
+static int malloc_nowait_count;
+static int malloc_failure_count;
+SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW,
+ &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
+TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate);
+SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
+ &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
+#endif
+
+static int
+sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS)
+{
+ u_long size;
+
+ size = vmem_size(kmem_arena, VMEM_ALLOC);
+ return (sysctl_handle_long(oidp, &size, 0, req));
+}
+
+static int
+sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
+{
+ u_long size;
+
+ size = vmem_size(kmem_arena, VMEM_FREE);
+ return (sysctl_handle_long(oidp, &size, 0, req));
+}
+
+/*
+ * malloc(9) uma zone separation -- sub-page buffer overruns in one
+ * malloc type will affect only a subset of other malloc types.
+ */
+#if MALLOC_DEBUG_MAXZONES > 1
+static void
+tunable_set_numzones(void)
+{
+
+ TUNABLE_INT_FETCH("debug.malloc.numzones",
+ &numzones);
+
+ /* Sanity check the number of malloc uma zones. */
+ if (numzones <= 0)
+ numzones = 1;
+ if (numzones > MALLOC_DEBUG_MAXZONES)
+ numzones = MALLOC_DEBUG_MAXZONES;
+}
+SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL);
+SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN,
+ &numzones, 0, "Number of malloc uma subzones");
+
+/*
+ * Any number that changes regularly is an okay choice for the
+ * offset. Build numbers are pretty good if you have them.
+ */
+static u_int zone_offset = __FreeBSD_version;
+TUNABLE_INT("debug.malloc.zone_offset", &zone_offset);
+SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN,
+ &zone_offset, 0, "Separate malloc types by examining the "
+ "Nth character in the malloc type short description.");
+
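+/*
+ * Select the debug subzone for a malloc type from one character of its
+ * short description, so that a given type always maps to the same
+ * subzone.
+ */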
+static u_int
+mtp_get_subzone(const char *desc)
+{
+ size_t len;
+ u_int val;
+
+ if (desc == NULL || (len = strlen(desc)) == 0)
+ return (0);
+ val = desc[zone_offset % len];
+ return (val % numzones);
+}
+#elif MALLOC_DEBUG_MAXZONES == 0
+#error "MALLOC_DEBUG_MAXZONES must be positive."
+#else
+static inline u_int
+mtp_get_subzone(const char *desc)
+{
+
+ return (0);
+}
+#endif /* MALLOC_DEBUG_MAXZONES > 1 */
+
+int
+malloc_last_fail(void)
+{
+
+ return (time_uptime - t_malloc_fail);
+}
+
+/*
+ * An allocation has succeeded -- update malloc type statistics for the
+ * amount of the bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
+ */
+static void
+malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
+ int zindx)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+
+ critical_enter();
+ mtip = mtp->ks_handle;
+ mtsp = &mtip->mti_stats[curcpu];
+ if (size > 0) {
+ mtsp->mts_memalloced += size;
+ mtsp->mts_numallocs++;
+ }
+ if (zindx != -1)
+ mtsp->mts_size |= 1 << zindx;
+
+#ifdef KDTRACE_HOOKS
+ if (dtrace_malloc_probe != NULL) {
+ uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_MALLOC];
+ if (probe_id != 0)
+ (dtrace_malloc_probe)(probe_id,
+ (uintptr_t) mtp, (uintptr_t) mtip,
+ (uintptr_t) mtsp, size, zindx);
+ }
+#endif
+
+ critical_exit();
+}
+
+void
+malloc_type_allocated(struct malloc_type *mtp, unsigned long size)
+{
+
+ if (size > 0)
+ malloc_type_zone_allocated(mtp, size, -1);
+}
+
+/*
+ * A free operation has occurred -- update malloc type statistics for the
+ * amount of the bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
+ */
+void
+malloc_type_freed(struct malloc_type *mtp, unsigned long size)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+
+ critical_enter();
+ mtip = mtp->ks_handle;
+ mtsp = &mtip->mti_stats[curcpu];
+ mtsp->mts_memfreed += size;
+ mtsp->mts_numfrees++;
+
+#ifdef KDTRACE_HOOKS
+ if (dtrace_malloc_probe != NULL) {
+ uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_FREE];
+ if (probe_id != 0)
+ (dtrace_malloc_probe)(probe_id,
+ (uintptr_t) mtp, (uintptr_t) mtip,
+ (uintptr_t) mtsp, size, 0);
+ }
+#endif
+
+ critical_exit();
+}
+
+/*
+ * contigmalloc:
+ *
+ * Allocate a block of physically contiguous memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ */
+void *
+contigmalloc(unsigned long size, struct malloc_type *type, int flags,
+ vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
+ vm_paddr_t boundary)
+{
+ void *ret;
+
+ ret = (void *)kmem_alloc_contig(kernel_arena, size, flags, low, high,
+ alignment, boundary, VM_MEMATTR_DEFAULT);
+ if (ret != NULL)
+ malloc_type_allocated(type, round_page(size));
+ return (ret);
+}
+
+/*
+ * contigfree:
+ *
+ * Free a block of memory allocated by contigmalloc.
+ *
+ * This routine may not block.
+ */
+void
+contigfree(void *addr, unsigned long size, struct malloc_type *type)
+{
+
+ kmem_free(kernel_arena, (vm_offset_t)addr, size);
+ malloc_type_freed(type, round_page(size));
+}
+
+/*
+ * malloc:
+ *
+ * Allocate a block of memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ */
+void *
+malloc(unsigned long size, struct malloc_type *mtp, int flags)
+{
+ int indx;
+ struct malloc_type_internal *mtip;
+ caddr_t va;
+ uma_zone_t zone;
+#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
+ unsigned long osize = size;
+#endif
+
+#ifdef INVARIANTS
+ KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
+ /*
+ * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
+ */
+ indx = flags & (M_WAITOK | M_NOWAIT);
+ if (indx != M_NOWAIT && indx != M_WAITOK) {
+ static struct timeval lasterr;
+ static int curerr, once;
+ if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
+ printf("Bad malloc flags: %x\n", indx);
+ kdb_backtrace();
+ flags |= M_WAITOK;
+ once++;
+ }
+ }
+#endif
+#ifdef MALLOC_MAKE_FAILURES
+ if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
+ atomic_add_int(&malloc_nowait_count, 1);
+ if ((malloc_nowait_count % malloc_failure_rate) == 0) {
+ atomic_add_int(&malloc_failure_count, 1);
+ t_malloc_fail = time_uptime;
+ return (NULL);
+ }
+ }
+#endif
+ if (flags & M_WAITOK)
+ KASSERT(curthread->td_intr_nesting_level == 0,
+ ("malloc(M_WAITOK) in interrupt context"));
+
+#ifdef DEBUG_MEMGUARD
+ if (memguard_cmp_mtp(mtp, size)) {
+ va = memguard_alloc(size, flags);
+ if (va != NULL)
+ return (va);
+ /* This is unfortunate but should not be fatal. */
+ }
+#endif
+
+#ifdef DEBUG_REDZONE
+ size = redzone_size_ntor(size);
+#endif
+
+ if (size <= KMEM_ZMAX) {
+ mtip = mtp->ks_handle;
+ if (size & KMEM_ZMASK)
+ size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
+ indx = kmemsize[size >> KMEM_ZSHIFT];
+ KASSERT(mtip->mti_zone < numzones,
+ ("mti_zone %u out of range %d",
+ mtip->mti_zone, numzones));
+ zone = kmemzones[indx].kz_zone[mtip->mti_zone];
+#ifdef MALLOC_PROFILE
+ krequests[size >> KMEM_ZSHIFT]++;
+#endif
+ va = uma_zalloc(zone, flags);
+ if (va != NULL)
+ size = zone->uz_size;
+ malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+ } else {
+ size = roundup(size, PAGE_SIZE);
+ zone = NULL;
+ va = uma_large_malloc(size, flags);
+ malloc_type_allocated(mtp, va == NULL ? 0 : size);
+ }
+ if (flags & M_WAITOK)
+ KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
+ else if (va == NULL)
+ t_malloc_fail = time_uptime;
+#ifdef DIAGNOSTIC
+ if (va != NULL && !(flags & M_ZERO)) {
+ memset(va, 0x70, osize);
+ }
+#endif
+#ifdef DEBUG_REDZONE
+ if (va != NULL)
+ va = redzone_setup(va, osize);
+#endif
+ return ((void *) va);
+}
+
+/*
+ * free:
+ *
+ * Free a block of memory allocated by malloc.
+ *
+ * This routine may not block.
+ */
+void
+free(void *addr, struct malloc_type *mtp)
+{
+ uma_slab_t slab;
+ u_long size;
+
+ KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
+
+ /* free(NULL, ...) does nothing */
+ if (addr == NULL)
+ return;
+
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(addr)) {
+ memguard_free(addr);
+ return;
+ }
+#endif
+
+#ifdef DEBUG_REDZONE
+ redzone_check(addr);
+ addr = redzone_addr_ntor(addr);
+#endif
+
+ slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
+
+ if (slab == NULL)
+ panic("free: address %p(%p) has not been allocated.\n",
+ addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
+
+ if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
+#ifdef INVARIANTS
+ struct malloc_type **mtpp = addr;
+#endif
+ size = slab->us_keg->uk_size;
+#ifdef INVARIANTS
+ /*
+ * Cache a pointer to the malloc_type that most recently freed
+ * this memory here. This way we know who is most likely to
+ * have stepped on it later.
+ *
+ * This code assumes that the size is a multiple of 8 bytes
+ * on 64-bit machines.
+ */
+ mtpp = (struct malloc_type **)
+ ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
+ mtpp += (size - sizeof(struct malloc_type *)) /
+ sizeof(struct malloc_type *);
+ *mtpp = mtp;
+#endif
+ uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
+ } else {
+ size = slab->us_size;
+ uma_large_free(slab);
+ }
+ malloc_type_freed(mtp, size);
+}
+
+/*
+ * realloc: change the size of a memory block
+ */
+void *
+realloc(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
+{
+ uma_slab_t slab;
+ unsigned long alloc;
+ void *newaddr;
+
+ KASSERT(mtp->ks_magic == M_MAGIC,
+ ("realloc: bad malloc type magic"));
+
+ /* realloc(NULL, ...) is equivalent to malloc(...) */
+ if (addr == NULL)
+ return (malloc(size, mtp, flags));
+
+ /*
+ * XXX: Should report free of old memory and alloc of new memory to
+ * per-CPU stats.
+ */
+
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(addr))
+ return (memguard_realloc(addr, size, mtp, flags));
+#endif
+
+#ifdef DEBUG_REDZONE
+ slab = NULL;
+ alloc = redzone_get_size(addr);
+#else
+ slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
+
+ /* Sanity check */
+ KASSERT(slab != NULL,
+ ("realloc: address %p out of range", (void *)addr));
+
+ /* Get the size of the original block */
+ if (!(slab->us_flags & UMA_SLAB_MALLOC))
+ alloc = slab->us_keg->uk_size;
+ else
+ alloc = slab->us_size;
+
+ /* Reuse the original block if appropriate */
+ if (size <= alloc
+ && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
+ return (addr);
+#endif /* !DEBUG_REDZONE */
+
+ /* Allocate a new, bigger (or smaller) block */
+ if ((newaddr = malloc(size, mtp, flags)) == NULL)
+ return (NULL);
+
+ /* Copy over original contents */
+ bcopy(addr, newaddr, min(size, alloc));
+ free(addr, mtp);
+ return (newaddr);
+}
+
+/*
+ * reallocf: same as realloc() but free memory on failure.
+ */
+void *
+reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
+{
+ void *mem;
+
+ if ((mem = realloc(addr, size, mtp, flags)) == NULL)
+ free(addr, mtp);
+ return (mem);
+}
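+
+/*
+ * Illustrative sketch: reallocf() avoids the classic leak in the
+ * "p = realloc(p, n)" idiom, where a failed grow would overwrite the
+ * only pointer to the still-allocated old block.  Because reallocf()
+ * frees the old block on failure, a NULL return leaves nothing to
+ * clean up ("buf" is a placeholder):
+ *
+ *	buf = reallocf(buf, newsize, M_TEMP, M_NOWAIT);
+ *	if (buf == NULL)
+ *		return (ENOMEM);
+ */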
+
+/*
+ * Wake the page daemon when we exhaust KVA. It will call the lowmem handler
+ * and uma_reclaim() callbacks in a context that is safe.
+ */
+static void
+kmem_reclaim(vmem_t *vm, int flags)
+{
+
+ pagedaemon_wakeup();
+}
+
+/*
+ * Initialize the kernel memory arena.
+ */
+void
+kmeminit(void)
+{
+ u_long mem_size, tmp;
+
+ /*
+ * Try to auto-tune the kernel memory size, so that it is
+ * more applicable for a wider range of machine sizes. The
+ * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
+ * available.
+ *
+ * Note that the kmem_map is also used by the zone allocator,
+ * so make sure that there is enough space.
+ */
+ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
+ mem_size = cnt.v_page_count;
+
+#if defined(VM_KMEM_SIZE_SCALE)
+ vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
+#endif
+ TUNABLE_INT_FETCH("vm.kmem_size_scale", &vm_kmem_size_scale);
+ if (vm_kmem_size_scale > 0 &&
+ (mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE))
+ vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
+
+#if defined(VM_KMEM_SIZE_MIN)
+ vm_kmem_size_min = VM_KMEM_SIZE_MIN;
+#endif
+ TUNABLE_ULONG_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
+ if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) {
+ vm_kmem_size = vm_kmem_size_min;
+ }
+
+#if defined(VM_KMEM_SIZE_MAX)
+ vm_kmem_size_max = VM_KMEM_SIZE_MAX;
+#endif
+ TUNABLE_ULONG_FETCH("vm.kmem_size_max", &vm_kmem_size_max);
+ if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max)
+ vm_kmem_size = vm_kmem_size_max;
+
+ /* Allow final override from the kernel environment */
+ TUNABLE_ULONG_FETCH("vm.kmem_size", &vm_kmem_size);
+
+ /*
+ * Limit kmem virtual size to twice the physical memory.
+ * This allows for kmem map sparseness, but limits the size
+	 * to something sane.  Be careful not to overflow the 32-bit
+	 * ints while doing the check or the adjustment.
+ */
+ if (vm_kmem_size / 2 / PAGE_SIZE > mem_size)
+ vm_kmem_size = 2 * mem_size * PAGE_SIZE;
+
+ vm_kmem_size = round_page(vm_kmem_size);
+#ifdef DEBUG_MEMGUARD
+ tmp = memguard_fudge(vm_kmem_size, kernel_map);
+#else
+ tmp = vm_kmem_size;
+#endif
+ vmem_init(kmem_arena, "kmem arena", kva_alloc(tmp), tmp, PAGE_SIZE,
+ 0, 0);
+ vmem_set_reclaim(kmem_arena, kmem_reclaim);
+
+#ifdef DEBUG_MEMGUARD
+ /*
+ * Initialize MemGuard if support compiled in. MemGuard is a
+ * replacement allocator used for detecting tamper-after-free
+ * scenarios as they occur. It is only used for debugging.
+ */
+ memguard_init(kmem_arena);
+#endif
+}
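+
+/*
+ * Worked example of the tuning above (a sketch; the constants are
+ * machine-dependent): with 4GB of RAM (mem_size = 1048576 4K pages)
+ * and vm.kmem_size_scale = 3, the scaled size is (1048576 / 3) * 4096
+ * bytes, roughly 1.3GB.  That value replaces the static default only
+ * if it is larger, and is then subject to vm.kmem_size_min/max, an
+ * explicit vm.kmem_size override, and the twice-physical-memory cap.
+ */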
+
+/*
+ * Initialize the kernel memory allocator
+ */
+/* ARGSUSED*/
+static void
+mallocinit(void *dummy)
+{
+ int i;
+ uint8_t indx;
+
+ mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
+
+ kmeminit();
+
+ uma_startup2();
+
+ mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
+#ifdef INVARIANTS
+ mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
+ for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
+ int size = kmemzones[indx].kz_size;
+ char *name = kmemzones[indx].kz_name;
+ int subzone;
+
+ for (subzone = 0; subzone < numzones; subzone++) {
+ kmemzones[indx].kz_zone[subzone] =
+ uma_zcreate(name, size,
+#ifdef INVARIANTS
+ mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
+ }
+		for (; i <= size; i += KMEM_ZBASE)
+ kmemsize[i >> KMEM_ZSHIFT] = indx;
+
+ }
+}
+SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, mallocinit, NULL);
+
+void
+malloc_init(void *data)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+
+ KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init"));
+
+ mtp = data;
+ if (mtp->ks_magic != M_MAGIC)
+ panic("malloc_init: bad malloc type magic");
+
+ mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
+ mtp->ks_handle = mtip;
+ mtip->mti_zone = mtp_get_subzone(mtp->ks_shortdesc);
+
+ mtx_lock(&malloc_mtx);
+ mtp->ks_next = kmemstatistics;
+ kmemstatistics = mtp;
+ kmemcount++;
+ mtx_unlock(&malloc_mtx);
+}
+
+void
+malloc_uninit(void *data)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+ struct malloc_type *mtp, *temp;
+ uma_slab_t slab;
+ long temp_allocs, temp_bytes;
+ int i;
+
+ mtp = data;
+ KASSERT(mtp->ks_magic == M_MAGIC,
+ ("malloc_uninit: bad malloc type magic"));
+ KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL"));
+
+ mtx_lock(&malloc_mtx);
+ mtip = mtp->ks_handle;
+ mtp->ks_handle = NULL;
+ if (mtp != kmemstatistics) {
+ for (temp = kmemstatistics; temp != NULL;
+ temp = temp->ks_next) {
+ if (temp->ks_next == mtp) {
+ temp->ks_next = mtp->ks_next;
+ break;
+ }
+ }
+ KASSERT(temp,
+ ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc));
+ } else
+ kmemstatistics = mtp->ks_next;
+ kmemcount--;
+ mtx_unlock(&malloc_mtx);
+
+ /*
+ * Look for memory leaks.
+ */
+ temp_allocs = temp_bytes = 0;
+ for (i = 0; i < MAXCPU; i++) {
+ mtsp = &mtip->mti_stats[i];
+ temp_allocs += mtsp->mts_numallocs;
+ temp_allocs -= mtsp->mts_numfrees;
+ temp_bytes += mtsp->mts_memalloced;
+ temp_bytes -= mtsp->mts_memfreed;
+ }
+ if (temp_allocs > 0 || temp_bytes > 0) {
+ printf("Warning: memory type %s leaked memory on destroy "
+ "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc,
+ temp_allocs, temp_bytes);
+ }
+
+ slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
+ uma_zfree_arg(mt_zone, mtip, slab);
+}
+
+struct malloc_type *
+malloc_desc2type(const char *desc)
+{
+ struct malloc_type *mtp;
+
+ mtx_assert(&malloc_mtx, MA_OWNED);
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ if (strcmp(mtp->ks_shortdesc, desc) == 0)
+ return (mtp);
+ }
+ return (NULL);
+}
+
+static int
+sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct malloc_type_stream_header mtsh;
+ struct malloc_type_internal *mtip;
+ struct malloc_type_header mth;
+ struct malloc_type *mtp;
+ int error, i;
+ struct sbuf sbuf;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ mtx_lock(&malloc_mtx);
+
+ /*
+ * Insert stream header.
+ */
+ bzero(&mtsh, sizeof(mtsh));
+ mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION;
+ mtsh.mtsh_maxcpus = MAXCPU;
+ mtsh.mtsh_count = kmemcount;
+ (void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh));
+
+ /*
+ * Insert alternating sequence of type headers and type statistics.
+ */
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = (struct malloc_type_internal *)mtp->ks_handle;
+
+ /*
+ * Insert type header.
+ */
+ bzero(&mth, sizeof(mth));
+ strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME);
+ (void)sbuf_bcat(&sbuf, &mth, sizeof(mth));
+
+ /*
+ * Insert type statistics for each CPU.
+ */
+ for (i = 0; i < MAXCPU; i++) {
+ (void)sbuf_bcat(&sbuf, &mtip->mti_stats[i],
+ sizeof(mtip->mti_stats[i]));
+ }
+ }
+ mtx_unlock(&malloc_mtx);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
+ 0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats",
+ "Return malloc types");
+
+SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0,
+ "Count of kernel malloc types");
+
+void
+malloc_type_list(malloc_type_list_func_t *func, void *arg)
+{
+ struct malloc_type *mtp, **bufmtp;
+ int count, i;
+ size_t buflen;
+
+ mtx_lock(&malloc_mtx);
+restart:
+ mtx_assert(&malloc_mtx, MA_OWNED);
+ count = kmemcount;
+ mtx_unlock(&malloc_mtx);
+
+ buflen = sizeof(struct malloc_type *) * count;
+ bufmtp = malloc(buflen, M_TEMP, M_WAITOK);
+
+ mtx_lock(&malloc_mtx);
+
+ if (count < kmemcount) {
+ free(bufmtp, M_TEMP);
+ goto restart;
+ }
+
+ for (mtp = kmemstatistics, i = 0; mtp != NULL; mtp = mtp->ks_next, i++)
+ bufmtp[i] = mtp;
+
+ mtx_unlock(&malloc_mtx);
+
+ for (i = 0; i < count; i++)
+ (func)(bufmtp[i], arg);
+
+ free(bufmtp, M_TEMP);
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(malloc, db_show_malloc)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+ uint64_t allocs, frees;
+ uint64_t alloced, freed;
+ int i;
+
+ db_printf("%18s %12s %12s %12s\n", "Type", "InUse", "MemUse",
+ "Requests");
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = (struct malloc_type_internal *)mtp->ks_handle;
+ allocs = 0;
+ frees = 0;
+ alloced = 0;
+ freed = 0;
+ for (i = 0; i < MAXCPU; i++) {
+ allocs += mtip->mti_stats[i].mts_numallocs;
+ frees += mtip->mti_stats[i].mts_numfrees;
+ alloced += mtip->mti_stats[i].mts_memalloced;
+ freed += mtip->mti_stats[i].mts_memfreed;
+ }
+ db_printf("%18s %12ju %12juK %12ju\n",
+ mtp->ks_shortdesc, allocs - frees,
+ (alloced - freed + 1023) / 1024, allocs);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+#if MALLOC_DEBUG_MAXZONES > 1
+DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+ u_int subzone;
+
+ if (!have_addr) {
+ db_printf("Usage: show multizone_matches <malloc type/addr>\n");
+ return;
+ }
+ mtp = (void *)addr;
+ if (mtp->ks_magic != M_MAGIC) {
+ db_printf("Magic %lx does not match expected %x\n",
+ mtp->ks_magic, M_MAGIC);
+ return;
+ }
+
+ mtip = mtp->ks_handle;
+ subzone = mtip->mti_zone;
+
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = mtp->ks_handle;
+ if (mtip->mti_zone != subzone)
+ continue;
+ db_printf("%s\n", mtp->ks_shortdesc);
+ if (db_pager_quit)
+ break;
+ }
+}
+#endif /* MALLOC_DEBUG_MAXZONES > 1 */
+#endif /* DDB */
+
+#ifdef MALLOC_PROFILE
+
+static int
+sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ uint64_t count;
+ uint64_t waste;
+ uint64_t mem;
+ int error;
+ int rsize;
+ int size;
+ int i;
+
+ waste = 0;
+ mem = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ sbuf_printf(&sbuf,
+ "\n Size Requests Real Size\n");
+ for (i = 0; i < KMEM_ZSIZE; i++) {
+ size = i << KMEM_ZSHIFT;
+ rsize = kmemzones[kmemsize[i]].kz_size;
+ count = (long long unsigned)krequests[i];
+
+ sbuf_printf(&sbuf, "%6d%28llu%11d\n", size,
+ (unsigned long long)count, rsize);
+
+ if ((rsize * count) > (size * count))
+ waste += (rsize * count) - (size * count);
+ mem += (rsize * count);
+ }
+ sbuf_printf(&sbuf,
+ "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
+ (unsigned long long)mem, (unsigned long long)waste);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
+#endif /* MALLOC_PROFILE */
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
new file mode 100644
index 0000000..5d58942
--- /dev/null
+++ b/sys/kern/kern_mbuf.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 2004, 2005,
+ * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <vm/uma_dbg.h>
+
+/*
+ * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
+ * Zones.
+ *
+ * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
+ * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
+ * administrator so desires.
+ *
+ * Mbufs are allocated from a UMA Master Zone called the Mbuf
+ * Zone.
+ *
+ * Additionally, FreeBSD provides a Packet Zone, which it
+ * configures as a Secondary Zone to the Mbuf Master Zone,
+ * thus sharing backend Slab kegs with the Mbuf Master Zone.
+ *
+ * Thus common-case allocations and locking are simplified:
+ *
+ * m_clget() m_getcl()
+ * | |
+ * | .------------>[(Packet Cache)] m_get(), m_gethdr()
+ * | | [ Packet ] |
+ * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
+ * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
+ * | \________ |
+ * [ Cluster Keg ] \ /
+ * | [ Mbuf Keg ]
+ * [ Cluster Slabs ] |
+ * | [ Mbuf Slabs ]
+ * \____________(VM)_________________/
+ *
+ *
+ * Whenever an object is allocated with uma_zalloc() out of
+ * one of the Zones, its _ctor_ function is executed.  Likewise,
+ * for any deallocation through uma_zfree() the _dtor_ function
+ * is executed.
+ *
+ * Caches are per-CPU and are filled from the Master Zone.
+ *
+ * Whenever an object is allocated from the underlying global
+ * memory pool it gets pre-initialized with the _zinit_ functions.
+ * When the Kegs are overfull, objects get decommissioned with
+ * _zfini_ functions and freed back to the global memory pool.
+ *
+ */
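+
+/*
+ * Illustrative sketch of the two allocation paths drawn above: a
+ * caller wanting an mbuf with a 2K cluster attached can either take
+ * the combined path through the Packet Zone,
+ *
+ *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ *
+ * or allocate the pieces separately from the Mbuf and Cluster Zones,
+ *
+ *	m = m_gethdr(M_NOWAIT, MT_DATA);
+ *	m_clget(m, M_NOWAIT);
+ *
+ * checking m for NULL and m->m_flags for M_EXT, respectively, to
+ * detect failure.
+ */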
+
+int nmbufs; /* limits number of mbufs */
+int nmbclusters; /* limits number of mbuf clusters */
+int nmbjumbop; /* limits number of page size jumbo clusters */
+int nmbjumbo9; /* limits number of 9k jumbo clusters */
+int nmbjumbo16; /* limits number of 16k jumbo clusters */
+
+static quad_t maxmbufmem; /* overall real memory limit for all mbufs */
+
+SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN, &maxmbufmem, 0,
+ "Maximum real memory allocatable to various mbuf types");
+
+/*
+ * tunable_mbinit() has to be run before any mbuf allocations are done.
+ */
+static void
+tunable_mbinit(void *dummy)
+{
+ quad_t realmem;
+
+ /*
+ * The default limit for all mbuf related memory is 1/2 of all
+ * available kernel memory (physical or kmem).
+ * At most it can be 3/4 of available kernel memory.
+ */
+ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
+ maxmbufmem = realmem / 2;
+ TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
+ if (maxmbufmem > realmem / 4 * 3)
+ maxmbufmem = realmem / 4 * 3;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+ if (nmbclusters == 0)
+ nmbclusters = maxmbufmem / MCLBYTES / 4;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
+ if (nmbjumbop == 0)
+ nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
+ if (nmbjumbo9 == 0)
+ nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
+ if (nmbjumbo16 == 0)
+ nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
+
+ /*
+ * We need at least as many mbufs as we have clusters of
+ * the various types added together.
+ */
+ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
+ if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
+ nmbufs = lmax(maxmbufmem / MSIZE / 5,
+ nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
+}
+SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
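+
+/*
+ * Worked example of the defaults above (a sketch, assuming 4K pages
+ * and at least as much kernel memory as RAM): with 8GB of physical
+ * memory, maxmbufmem defaults to 4GB; nmbclusters then defaults to
+ * 4GB / MCLBYTES(2048) / 4 = 524288 clusters, and nmbjumbop to
+ * 4GB / MJUMPAGESIZE(4096) / 4 = 262144 page-size jumbo clusters,
+ * unless the corresponding loader tunables override them.
+ */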
+
+static int
+sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbclusters;
+
+ newnmbclusters = nmbclusters;
+ error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbclusters > nmbclusters &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbclusters = newnmbclusters;
+ nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
+ EVENTHANDLER_INVOKE(nmbclusters_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
+&nmbclusters, 0, sysctl_nmbclusters, "IU",
+ "Maximum number of mbuf clusters allowed");
+
+static int
+sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbop;
+
+ newnmbjumbop = nmbjumbop;
+ error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbop > nmbjumbop &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbop = newnmbjumbop;
+ nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbop, 0, sysctl_nmbjumbop, "IU",
+ "Maximum number of mbuf page size jumbo clusters allowed");
+
+static int
+sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo9;
+
+ newnmbjumbo9 = nmbjumbo9;
+ error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo9 > nmbjumbo9 &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbo9 = newnmbjumbo9;
+ nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
+ "Maximum number of mbuf 9k jumbo clusters allowed");
+
+static int
+sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo16;
+
+ newnmbjumbo16 = nmbjumbo16;
+ error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo16 > nmbjumbo16 &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbo16 = newnmbjumbo16;
+ nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
+ "Maximum number of mbuf 16k jumbo clusters allowed");
+
+static int
+sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbufs;
+
+ newnmbufs = nmbufs;
+ error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbufs > nmbufs) {
+ nmbufs = newnmbufs;
+ nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
+ EVENTHANDLER_INVOKE(nmbufs_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
+&nmbufs, 0, sysctl_nmbufs, "IU",
+ "Maximum number of mbufs allowed");
+
+/*
+ * Zones from which we allocate.
+ */
+uma_zone_t zone_mbuf;
+uma_zone_t zone_clust;
+uma_zone_t zone_pack;
+uma_zone_t zone_jumbop;
+uma_zone_t zone_jumbo9;
+uma_zone_t zone_jumbo16;
+uma_zone_t zone_ext_refcnt;
+
+/*
+ * Local prototypes.
+ */
+static int mb_ctor_mbuf(void *, int, void *, int);
+static int mb_ctor_clust(void *, int, void *, int);
+static int mb_ctor_pack(void *, int, void *, int);
+static void mb_dtor_mbuf(void *, int, void *);
+static void mb_dtor_clust(void *, int, void *);
+static void mb_dtor_pack(void *, int, void *);
+static int mb_zinit_pack(void *, int, int);
+static void mb_zfini_pack(void *, int);
+
+static void mb_reclaim(void *);
+static void *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int);
+
+/* Ensure that MSIZE is a power of 2. */
+CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
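+/*
+ * For example, with MSIZE = 256: (255 ^ 256) = 511, plus 1 is 512,
+ * shifted right once is 256 again.  A non-power-of-two such as 384
+ * would yield 128 instead and fail the assertion.
+ */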
+
+/*
+ * Initialize FreeBSD Network buffer allocation.
+ */
+static void
+mbuf_init(void *dummy)
+{
+
+ /*
+ * Configure UMA zones for Mbufs, Clusters, and Packets.
+ */
+ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
+ mb_ctor_mbuf, mb_dtor_mbuf,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ MSIZE - 1, UMA_ZONE_MAXBUCKET);
+ if (nmbufs > 0)
+ nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
+ uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
+
+ zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbclusters > 0)
+ nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
+ uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
+
+ zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
+ mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
+
+	/* Make jumbo frame zones too: page size, 9k and 16k. */
+ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbjumbop > 0)
+ nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
+ uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
+
+ zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
+ if (nmbjumbo9 > 0)
+ nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+ uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
+
+ zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
+ if (nmbjumbo16 > 0)
+ nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
+
+ zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
+ NULL, NULL,
+ NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
+ /* uma_prealloc() goes here... */
+
+ /*
+ * Hook event handler for low-memory situation, used to
+ * drain protocols and push data back to the caches (UMA
+ * later pushes it back to VM).
+ */
+ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
+ EVENTHANDLER_PRI_FIRST);
+}
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
+
+/*
+ * UMA backend page allocator for the jumbo frame zones.
+ *
+ * Allocates kernel virtual memory that is backed by contiguous physical
+ * pages.
+ */
+static void *
+mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+{
+
+ /* Inform UMA that this allocator uses kernel_map/object. */
+ *flags = UMA_SLAB_KERNEL;
+ return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait,
+ (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
+}
+
+/*
+ * Constructor for Mbuf master zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API. See mbuf.h.
+ */
+static int
+mb_ctor_mbuf(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int error;
+ int flags;
+ short type;
+
+#ifdef INVARIANTS
+ trash_ctor(mem, size, arg, how);
+#endif
+ args = (struct mb_args *)arg;
+ type = args->type;
+
+ /*
+ * The mbuf is initialized later. The caller has the
+ * responsibility to set up any MAC labels too.
+ */
+ if (type == MT_NOINIT)
+ return (0);
+
+ m = (struct mbuf *)mem;
+ flags = args->flags;
+
+ error = m_init(m, NULL, size, how, type, flags);
+
+ return (error);
+}
+
+/*
+ * The Mbuf master zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ unsigned long flags;
+
+ m = (struct mbuf *)mem;
+ flags = (unsigned long)arg;
+
+ if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
+ m_tag_delete_chain(m, NULL);
+ KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+ KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
+#ifdef INVARIANTS
+ trash_dtor(mem, size, arg);
+#endif
+}
+
+/*
+ * The Mbuf Packet zone destructor.
+ */
+static void
+mb_dtor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+
+ /* Make sure we've got a clean cluster back. */
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
+ KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
+ KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
+ KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
+ KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
+ KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
+ KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
+#ifdef INVARIANTS
+ trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
+#endif
+ /*
+ * If there are processes blocked on zone_clust, waiting for pages
+	 * to be freed up, cause them to be woken up by draining the
+	 * packet zone.  We are exposed to a race here (in the check for
+ * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
+ * is deliberate. We don't want to acquire the zone lock for every
+ * mbuf free.
+ */
+ if (uma_zone_exhausted_nolock(zone_clust))
+ zone_drain(zone_pack);
+}
+
+/*
+ * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
+ *
+ * Here the 'arg' pointer points to the Mbuf which we
+ * are configuring cluster storage for.  If 'arg' is
+ * NULL we allocate just the cluster without attaching
+ * it to an mbuf.  See mbuf.h.
+ */
+static int
+mb_ctor_clust(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ u_int *refcnt;
+ int type;
+ uma_zone_t zone;
+
+#ifdef INVARIANTS
+ trash_ctor(mem, size, arg, how);
+#endif
+ switch (size) {
+ case MCLBYTES:
+ type = EXT_CLUSTER;
+ zone = zone_clust;
+ break;
+#if MJUMPAGESIZE != MCLBYTES
+ case MJUMPAGESIZE:
+ type = EXT_JUMBOP;
+ zone = zone_jumbop;
+ break;
+#endif
+ case MJUM9BYTES:
+ type = EXT_JUMBO9;
+ zone = zone_jumbo9;
+ break;
+ case MJUM16BYTES:
+ type = EXT_JUMBO16;
+ zone = zone_jumbo16;
+ break;
+ default:
+ panic("unknown cluster size");
+ break;
+ }
+
+ m = (struct mbuf *)arg;
+ refcnt = uma_find_refcnt(zone, mem);
+ *refcnt = 1;
+ if (m != NULL) {
+ m->m_ext.ext_buf = (caddr_t)mem;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags |= M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg1 = NULL;
+ m->m_ext.ext_arg2 = NULL;
+ m->m_ext.ext_size = size;
+ m->m_ext.ext_type = type;
+ m->m_ext.ext_flags = 0;
+ m->m_ext.ref_cnt = refcnt;
+ }
+
+ return (0);
+}
+
+/*
+ * The Mbuf Cluster zone destructor.
+ */
+static void
+mb_dtor_clust(void *mem, int size, void *arg)
+{
+#ifdef INVARIANTS
+ uma_zone_t zone;
+
+ zone = m_getzone(size);
+ KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
+ ("%s: refcnt incorrect %u", __func__,
+ *(uma_find_refcnt(zone, mem))) );
+
+ trash_dtor(mem, size, arg);
+#endif
+}
+
+/*
+ * The Packet secondary zone's init routine, executed on the
+ * object's transition from mbuf keg slab to zone cache.
+ */
+static int
+mb_zinit_pack(void *mem, int size, int how)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem; /* m is virgin. */
+ if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
+ m->m_ext.ext_buf == NULL)
+ return (ENOMEM);
+ m->m_ext.ext_type = EXT_PACKET; /* Override. */
+#ifdef INVARIANTS
+ trash_init(m->m_ext.ext_buf, MCLBYTES, how);
+#endif
+ return (0);
+}
+
+/*
+ * The Packet secondary zone's fini routine, executed on the
+ * object's transition from zone cache to keg slab.
+ */
+static void
+mb_zfini_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+#ifdef INVARIANTS
+ trash_fini(m->m_ext.ext_buf, MCLBYTES);
+#endif
+ uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+#ifdef INVARIANTS
+ trash_dtor(mem, size, NULL);
+#endif
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static int
+mb_ctor_pack(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int error, flags;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ type = args->type;
+
+#ifdef INVARIANTS
+ trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
+#endif
+
+ error = m_init(m, NULL, size, how, type, flags);
+
+ /* m_ext is already initialized. */
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags = (flags | M_EXT);
+
+ return (error);
+}
+
+int
+m_pkthdr_init(struct mbuf *m, int how)
+{
+#ifdef MAC
+ int error;
+#endif
+ m->m_data = m->m_pktdat;
+ m->m_pkthdr.rcvif = NULL;
+ SLIST_INIT(&m->m_pkthdr.tags);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.flowid = 0;
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.fibnum = 0;
+ m->m_pkthdr.cosqos = 0;
+ m->m_pkthdr.rsstype = 0;
+ m->m_pkthdr.l2hlen = 0;
+ m->m_pkthdr.l3hlen = 0;
+ m->m_pkthdr.l4hlen = 0;
+ m->m_pkthdr.l5hlen = 0;
+ m->m_pkthdr.PH_per.sixtyfour[0] = 0;
+ m->m_pkthdr.PH_loc.sixtyfour[0] = 0;
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ error = mac_mbuf_init(m, how);
+ if (error)
+ return (error);
+#endif
+
+ return (0);
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+ "mb_reclaim()");
+
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
new file mode 100644
index 0000000..c84d4b2
--- /dev/null
+++ b/sys/kern/kern_mib.c
@@ -0,0 +1,542 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more userfriendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+#include "opt_config.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/jail.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+
+SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
+ "Sysctl internal magic");
+SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW|CTLFLAG_CAPRD, 0,
+ "High kernel, proc, limits &c");
+SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
+ "Virtual memory");
+SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
+ "File system");
+SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
+ "Network, (see socket.h)");
+SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
+ "Debugging");
+SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0,
+ "Sizeof various things");
+SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
+ "hardware");
+SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
+ "machine dependent");
+SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
+ "user-level");
+SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0,
+ "p1003_1b, (see p1003_1b.h)");
+
+SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0,
+ "Compatibility code");
+SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0,
+ "Security");
+#ifdef REGRESSION
+SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0,
+ "Regression test MIB");
+#endif
+
+SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ kern_ident, 0, "Kernel identifier");
+
+SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD|CTLFLAG_MPSAFE|
+ CTLFLAG_CAPRD, osrelease, 0, "Operating system release");
+
+SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, BSD, "Operating system revision");
+
+SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ version, 0, "Kernel version");
+
+SYSCTL_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ compiler_version, 0, "Version of compiler used to compile kernel");
+
+SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE|
+ CTLFLAG_CAPRD, ostype, 0, "Operating system type");
+
+/*
+ * NOTICE: The *userland* release date is available in
+ * /usr/include/osreldate.h
+ */
+SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD|CTLFLAG_CAPRD,
+ &osreldate, 0, "Kernel release date");
+
+SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN,
+ &maxproc, 0, "Maximum number of processes");
+
+SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
+ &maxprocperuid, 0, "Maximum processes allowed per userid");
+
+SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN,
+ &maxusers, 0, "Hint for kernel tuning");
+
+SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, ARG_MAX, "Maximum bytes of argument to execve(2)");
+
+SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, _POSIX_VERSION, "Version of POSIX attempting to comply to");
+
+SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
+ &ngroups_max, 0,
+ "Maximum number of supplemental groups a user can belong to");
+
+SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 1, "Whether job control is available");
+
+#ifdef _POSIX_SAVED_IDS
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 1, "Whether saved set-group/user ID is available");
+#else
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 0, "Whether saved set-group/user ID is available");
+#endif
+
+char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
+
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW,
+ kernelname, sizeof kernelname, "Name of kernel file booted");
+
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD,
+ &mp_ncpus, 0, "Number of active CPUs");
+
+SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, BYTE_ORDER, "System byte order");
+
+SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, PAGE_SIZE, "System memory page size");
+
+static int
+sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
+{
+ char buf[256];
+ size_t len;
+
+ len = req->oldlen;
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+ arc4rand(buf, len, 0);
+ return (SYSCTL_OUT(req, buf, len));
+}
+
+SYSCTL_PROC(_kern, KERN_ARND, arandom,
+ CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0,
+ sysctl_kern_arnd, "", "arc4rand");
+
+static int
+sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+
+ val = ctob(physmem);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_physmem, "LU", "");
+
+static int
+sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+ val = ctob(realmem);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_realmem, "LU", "");
+static int
+sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+
+ val = ctob(physmem - cnt.v_wire_count);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_usermem, "LU", "");
+
+SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, "");
+
+u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
+
+static int
+sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+#ifdef SCTL_MASK32
+ int i;
+ uint32_t pagesizes32[MAXPAGESIZES];
+
+ if (req->flags & SCTL_MASK32) {
+ /*
+ * Recreate the "pagesizes" array with 32-bit elements. Truncate
+ * any page size greater than UINT32_MAX to zero.
+ */
+ for (i = 0; i < MAXPAGESIZES; i++)
+ pagesizes32[i] = (uint32_t)pagesizes[i];
+
+ error = SYSCTL_OUT(req, pagesizes32, sizeof(pagesizes32));
+ } else
+#endif
+ error = SYSCTL_OUT(req, pagesizes, sizeof(pagesizes));
+ return (error);
+}
+SYSCTL_PROC(_hw, OID_AUTO, pagesizes, CTLTYPE_ULONG | CTLFLAG_RD,
+ NULL, 0, sysctl_hw_pagesizes, "LU", "Supported page sizes");
+
+#ifdef SCTL_MASK32
+int adaptive_machine_arch = 1;
+SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW,
+ &adaptive_machine_arch, 1,
+ "Adapt reported machine architecture to the ABI of the binary");
+#endif
+
+static int
+sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ static const char machine_arch[] = MACHINE_ARCH;
+#ifdef SCTL_MASK32
+ static const char machine_arch32[] = MACHINE_ARCH32;
+
+ if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch)
+ error = SYSCTL_OUT(req, machine_arch32, sizeof(machine_arch32));
+ else
+#endif
+ error = SYSCTL_OUT(req, machine_arch, sizeof(machine_arch));
+ return (error);
+
+}
+SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_hw_machine_arch, "A", "System architecture");
+
+static int
+sysctl_hostname(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ size_t pr_offset;
+ char tmpname[MAXHOSTNAMELEN];
+ int descend, error, len;
+
+ /*
+ * This function can set: hostname domainname hostuuid.
+ * Keep that in mind when comments say "hostname".
+ */
+ pr_offset = (size_t)arg1;
+ len = arg2;
+ KASSERT(len <= sizeof(tmpname),
+ ("length %d too long for %s", len, __func__));
+
+ pr = req->td->td_ucred->cr_prison;
+ if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
+ return (EPERM);
+ /*
+ * Make a local copy of hostname to get/set so we don't have to hold
+ * the jail mutex during the sysctl copyin/copyout activities.
+ */
+ mtx_lock(&pr->pr_mtx);
+ bcopy((char *)pr + pr_offset, tmpname, len);
+ mtx_unlock(&pr->pr_mtx);
+
+ error = sysctl_handle_string(oidp, tmpname, len, req);
+
+ if (req->newptr != NULL && error == 0) {
+ /*
+ * Copy the locally set hostname to all jails that share
+ * this host info.
+ */
+ sx_slock(&allprison_lock);
+ while (!(pr->pr_flags & PR_HOST))
+ pr = pr->pr_parent;
+ mtx_lock(&pr->pr_mtx);
+ bcopy(tmpname, (char *)pr + pr_offset, len);
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
+ if (cpr->pr_flags & PR_HOST)
+ descend = 0;
+ else
+ bcopy(tmpname, (char *)cpr + pr_offset, len);
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN,
+ sysctl_hostname, "A", "Hostname");
+SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN,
+ sysctl_hostname, "A", "Name of the current YP/NIS domain");
+SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN,
+ sysctl_hostname, "A", "Host UUID");
+
+static int regression_securelevel_nonmonotonic = 0;
+
+#ifdef REGRESSION
+SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
+ &regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
+#endif
+
+static int
+sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ int descend, error, level;
+
+ pr = req->td->td_ucred->cr_prison;
+
+ /*
+ * Reading the securelevel is easy, since the current jail's level
+ * is known to be at least as secure as any higher levels. Perform
+ * a lockless read since the securelevel is an integer.
+ */
+ level = pr->pr_securelevel;
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ /* Permit update only if the new securelevel exceeds the old. */
+ sx_slock(&allprison_lock);
+ mtx_lock(&pr->pr_mtx);
+ if (!regression_securelevel_nonmonotonic &&
+ level < pr->pr_securelevel) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EPERM);
+ }
+ pr->pr_securelevel = level;
+ /*
+ * Set all child jails to be at least this level, but do not lower
+ * them (even if regression_securelevel_nonmonotonic).
+ */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) {
+ if (cpr->pr_securelevel < level)
+ cpr->pr_securelevel = level;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
+ CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
+ "I", "Current secure level");
+
+#ifdef INCLUDE_CONFIG_FILE
+/* Actual kernel configuration options. */
+extern char kernconfstring[];
+
+SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, kernconfstring, 0,
+ "Kernel configuration file");
+#endif
+
+static int
+sysctl_hostid(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ u_long tmpid;
+ int descend, error;
+
+ /*
+ * Like sysctl_hostname, except it operates on a u_long
+ * instead of a string, and is used only for hostid.
+ */
+ pr = req->td->td_ucred->cr_prison;
+ if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
+ return (EPERM);
+ tmpid = pr->pr_hostid;
+ error = sysctl_handle_long(oidp, &tmpid, 0, req);
+
+ if (req->newptr != NULL && error == 0) {
+ sx_slock(&allprison_lock);
+ while (!(pr->pr_flags & PR_HOST))
+ pr = pr->pr_parent;
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_hostid = tmpid;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
+ if (cpr->pr_flags & PR_HOST)
+ descend = 0;
+ else
+ cpr->pr_hostid = tmpid;
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_HOSTID, hostid,
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_hostid, "LU", "Host ID");
+
+SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features");
+
+#ifdef COMPAT_FREEBSD4
+FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
+#endif
+
+#ifdef COMPAT_FREEBSD5
+FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
+#endif
+
+#ifdef COMPAT_FREEBSD6
+FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
+#endif
+
+#ifdef COMPAT_FREEBSD7
+FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
+#endif
+
+/*
+ * This is really cheating. These actually live in the libc, something
+ * which I'm not quite sure is a good idea anyway, but in order for
+ * getnext and friends to actually work, we define dummies here.
+ *
+ * XXXRW: These probably should be CTLFLAG_CAPRD.
+ */
+SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
+ "", 0, "PATH that finds all the standard utilities");
+SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
+ 0, 0, "Max ibase/obase values in bc(1)");
+SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
+ 0, 0, "Max array size in bc(1)");
+SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
+ 0, 0, "Max scale value in bc(1)");
+SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
+ 0, 0, "Max string length in bc(1)");
+SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
+SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
+ 0, 0, "Max length (bytes) of a text-processing utility's input line");
+SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of repeats of a regexp permitted");
+SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
+ 0, 0,
+ "The version of POSIX 1003.2 with which the system attempts to comply");
+SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
+ 0, 0, "Whether C development supports the C bindings option");
+SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports the C development utilities option");
+SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
+ 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN development utilities");
+SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN runtime utilities");
+SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
+ 0, 0, "Whether system supports creation of locales");
+SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports software development utilities");
+SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
+ 0, 0, "Whether system supports the user portability utilities");
+SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of streams a process may have open at one time");
+SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of types supported for timezone names");
+
+#include <sys/vnode.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
+ 0, sizeof(struct vnode), "sizeof(struct vnode)");
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
+ 0, sizeof(struct proc), "sizeof(struct proc)");
+
+static int
+sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS)
+{
+ int error, pm;
+
+ pm = pid_max;
+ error = sysctl_handle_int(oidp, &pm, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ sx_xlock(&proctree_lock);
+ sx_xlock(&allproc_lock);
+
+ /*
+	 * Only permit values less than PID_MAX.
+	 * As a safety measure, do not allow pid_max to be limited too much.
+ */
+ if (pm < 300 || pm > PID_MAX)
+ error = EINVAL;
+ else
+ pid_max = pm;
+ sx_xunlock(&allproc_lock);
+ sx_xunlock(&proctree_lock);
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_pid_max, "I",
+ "Maximum allowed pid");
+
+#include <sys/bio.h>
+#include <sys/buf.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
+ 0, sizeof(struct bio), "sizeof(struct bio)");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
+ 0, sizeof(struct buf), "sizeof(struct buf)");
+
+#include <sys/user.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
+ 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
+
+/* XXX compatibility, remove for 6.0 */
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
+ &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
+ "compatibility for kern.fallback_elf_brand");
diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c
new file mode 100644
index 0000000..b769320
--- /dev/null
+++ b/sys/kern/kern_module.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_compat.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/reboot.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+
+static MALLOC_DEFINE(M_MODULE, "module", "module data structures");
+
+struct module {
+ TAILQ_ENTRY(module) link; /* chain together all modules */
+ TAILQ_ENTRY(module) flink; /* all modules in a file */
+ struct linker_file *file; /* file which contains this module */
+ int refs; /* reference count */
+ int id; /* unique id number */
+ char *name; /* module name */
+ modeventhand_t handler; /* event handler */
+ void *arg; /* argument for handler */
+ modspecific_t data; /* module specific data */
+};
+
+#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
+
+static TAILQ_HEAD(modulelist, module) modules;
+struct sx modules_sx;
+static int nextid = 1;
+static void module_shutdown(void *, int);
+
+static int
+modevent_nop(module_t mod, int what, void *arg)
+{
+
+ switch(what) {
+ case MOD_LOAD:
+ return (0);
+ case MOD_UNLOAD:
+ return (EBUSY);
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static void
+module_init(void *arg)
+{
+
+ sx_init(&modules_sx, "module subsystem sx lock");
+ TAILQ_INIT(&modules);
+ EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+}
+
+SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
+
+static void
+module_shutdown(void *arg1, int arg2)
+{
+ module_t mod;
+
+ if (arg2 & RB_NOSYNC)
+ return;
+ mtx_lock(&Giant);
+ MOD_SLOCK;
+ TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link)
+ MOD_EVENT(mod, MOD_SHUTDOWN);
+ MOD_SUNLOCK;
+ mtx_unlock(&Giant);
+}
+
+void
+module_register_init(const void *arg)
+{
+ const moduledata_t *data = (const moduledata_t *)arg;
+ int error;
+ module_t mod;
+
+ mtx_lock(&Giant);
+ MOD_SLOCK;
+ mod = module_lookupbyname(data->name);
+ if (mod == NULL)
+ panic("module_register_init: module named %s not found\n",
+ data->name);
+ MOD_SUNLOCK;
+ error = MOD_EVENT(mod, MOD_LOAD);
+ if (error) {
+ MOD_EVENT(mod, MOD_UNLOAD);
+ MOD_XLOCK;
+ module_release(mod);
+ MOD_XUNLOCK;
+ printf("module_register_init: MOD_LOAD (%s, %p, %p) error"
+ " %d\n", data->name, (void *)data->evhand, data->priv,
+ error);
+ } else {
+ MOD_XLOCK;
+ if (mod->file) {
+ /*
+ * Once a module is successfully loaded, move
+ * it to the head of the module list for this
+ * linker file. This resorts the list so that
+ * when the kernel linker iterates over the
+ * modules to unload them, it will unload them
+ * in the reverse order they were loaded.
+ */
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink);
+ }
+ MOD_XUNLOCK;
+ }
+ mtx_unlock(&Giant);
+}
+
+int
+module_register(const moduledata_t *data, linker_file_t container)
+{
+ size_t namelen;
+ module_t newmod;
+
+ MOD_XLOCK;
+ newmod = module_lookupbyname(data->name);
+ if (newmod != NULL) {
+ MOD_XUNLOCK;
+ printf("module_register: module %s already exists!\n",
+ data->name);
+ return (EEXIST);
+ }
+ namelen = strlen(data->name) + 1;
+ newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
+ if (newmod == NULL) {
+ MOD_XUNLOCK;
+ return (ENOMEM);
+ }
+ newmod->refs = 1;
+ newmod->id = nextid++;
+ newmod->name = (char *)(newmod + 1);
+ strcpy(newmod->name, data->name);
+ newmod->handler = data->evhand ? data->evhand : modevent_nop;
+ newmod->arg = data->priv;
+ bzero(&newmod->data, sizeof(newmod->data));
+ TAILQ_INSERT_TAIL(&modules, newmod, link);
+
+ if (container)
+ TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
+ newmod->file = container;
+ MOD_XUNLOCK;
+ return (0);
+}
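+
+/*
+ * Illustrative sketch of how a module normally reaches
+ * module_register(): a subsystem or driver declares a moduledata_t
+ * and a DECLARE_MODULE() (or a wrapper such as DRIVER_MODULE()), e.g.
+ * ("foo" is a placeholder):
+ *
+ *	static int
+ *	foo_modevent(module_t mod, int what, void *arg)
+ *	{
+ *		switch (what) {
+ *		case MOD_LOAD:
+ *		case MOD_UNLOAD:
+ *			return (0);
+ *		default:
+ *			return (EOPNOTSUPP);
+ *		}
+ *	}
+ *
+ *	static moduledata_t foo_mod = { "foo", foo_modevent, NULL };
+ *	DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+ *
+ * The event handler is later invoked through MOD_EVENT() above for
+ * MOD_LOAD, MOD_UNLOAD, MOD_QUIESCE and MOD_SHUTDOWN.
+ */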
+
+void
+module_reference(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
+ mod->refs++;
+}
+
+void
+module_release(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ if (mod->refs <= 0)
+ panic("module_release: bad reference count");
+
+ MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
+
+ mod->refs--;
+ if (mod->refs == 0) {
+ TAILQ_REMOVE(&modules, mod, link);
+ if (mod->file)
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ free(mod, M_MODULE);
+ }
+}
+
+module_t
+module_lookupbyname(const char *name)
+{
+ module_t mod;
+ int err;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link) {
+ err = strcmp(mod->name, name);
+ if (err == 0)
+ return (mod);
+ }
+ return (NULL);
+}
+
+module_t
+module_lookupbyid(int modid)
+{
+ module_t mod;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link)
+ if (mod->id == modid)
+ return(mod);
+ return (NULL);
+}
+
+int
+module_quiesce(module_t mod)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = MOD_EVENT(mod, MOD_QUIESCE);
+ mtx_unlock(&Giant);
+ if (error == EOPNOTSUPP || error == EINVAL)
+ error = 0;
+ return (error);
+}
+
+int
+module_unload(module_t mod)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = MOD_EVENT(mod, MOD_UNLOAD);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+module_getid(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (mod->id);
+}
+
+module_t
+module_getfnext(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (TAILQ_NEXT(mod, flink));
+}
+
+const char *
+module_getname(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (mod->name);
+}
+
+void
+module_setspecific(module_t mod, modspecific_t *datap)
+{
+
+ MOD_XLOCK_ASSERT;
+ mod->data = *datap;
+}
+
+linker_file_t
+module_file(module_t mod)
+{
+
+ return (mod->file);
+}
+
+/*
+ * Syscalls.
+ */
+int
+sys_modnext(struct thread *td, struct modnext_args *uap)
+{
+ module_t mod;
+ int error = 0;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ if (uap->modid == 0) {
+ mod = TAILQ_FIRST(&modules);
+ if (mod)
+ td->td_retval[0] = mod->id;
+ else
+ error = ENOENT;
+ goto done2;
+ }
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ error = ENOENT;
+ goto done2;
+ }
+ if (TAILQ_NEXT(mod, link))
+ td->td_retval[0] = TAILQ_NEXT(mod, link)->id;
+ else
+ td->td_retval[0] = 0;
+done2:
+ MOD_SUNLOCK;
+ return (error);
+}
+
+int
+sys_modfnext(struct thread *td, struct modfnext_args *uap)
+{
+ module_t mod;
+ int error;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ error = ENOENT;
+ } else {
+ error = 0;
+ if (TAILQ_NEXT(mod, flink))
+ td->td_retval[0] = TAILQ_NEXT(mod, flink)->id;
+ else
+ td->td_retval[0] = 0;
+ }
+ MOD_SUNLOCK;
+ return (error);
+}
+
+struct module_stat_v1 {
+ int version; /* set to sizeof(struct module_stat) */
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+};
+
+int
+sys_modstat(struct thread *td, struct modstat_args *uap)
+{
+ module_t mod;
+ modspecific_t data;
+ int error = 0;
+ int id, namelen, refs, version;
+ struct module_stat *stat;
+ char *name;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ MOD_SUNLOCK;
+ return (ENOENT);
+ }
+ id = mod->id;
+ refs = mod->refs;
+ name = mod->name;
+ data = mod->data;
+ MOD_SUNLOCK;
+ stat = uap->stat;
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
+ return (error);
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat))
+ return (EINVAL);
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+ if ((error = copyout(name, &stat->name[0], namelen)) != 0)
+ return (error);
+
+ if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0)
+ return (error);
+ if ((error = copyout(&id, &stat->id, sizeof(int))) != 0)
+ return (error);
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat))
+ if ((error = copyout(&data, &stat->data,
+ sizeof(data))) != 0)
+ return (error);
+ td->td_retval[0] = 0;
+ return (error);
+}
+
+int
+sys_modfind(struct thread *td, struct modfind_args *uap)
+{
+ int error = 0;
+ char name[MAXMODNAME];
+ module_t mod;
+
+ if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0)
+ return (error);
+
+ MOD_SLOCK;
+ mod = module_lookupbyname(name);
+ if (mod == NULL)
+ error = ENOENT;
+ else
+ td->td_retval[0] = module_getid(mod);
+ MOD_SUNLOCK;
+ return (error);
+}
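+
+/*
+ * Userland usage sketch (illustrative): the system calls above are
+ * reachable through the wrappers declared in <sys/module.h>.  Setting
+ * version to sizeof(struct module_stat) selects the full (post-v1)
+ * structure; the module name "example" passed to modfind() is
+ * hypothetical.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/module.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		struct module_stat stat;
+ *		int modid;
+ *
+ *		stat.version = sizeof(stat);
+ *		for (modid = modnext(0); modid > 0; modid = modnext(modid))
+ *			if (modstat(modid, &stat) == 0)
+ *				printf("%d\t%s\trefs %d\n", stat.id,
+ *				    stat.name, stat.refs);
+ *		if (modfind("example") == -1)
+ *			printf("example is not loaded\n");
+ *		return (0);
+ *	}
+ */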
+
+MODULE_VERSION(kernel, __FreeBSD_version);
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+
+typedef union modspecific32 {
+ int intval;
+ uint32_t uintval;
+ int longval;
+ uint32_t ulongval;
+} modspecific32_t;
+
+struct module_stat32 {
+ int version;
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+ modspecific32_t data;
+};
+
+int
+freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
+{
+ module_t mod;
+ modspecific32_t data32;
+ int error = 0;
+ int id, namelen, refs, version;
+ struct module_stat32 *stat32;
+ char *name;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ MOD_SUNLOCK;
+ return (ENOENT);
+ }
+
+ id = mod->id;
+ refs = mod->refs;
+ name = mod->name;
+ CP(mod->data, data32, intval);
+ CP(mod->data, data32, uintval);
+ CP(mod->data, data32, longval);
+ CP(mod->data, data32, ulongval);
+ MOD_SUNLOCK;
+ stat32 = uap->stat;
+
+ if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0)
+ return (error);
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat32))
+ return (EINVAL);
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+ if ((error = copyout(name, &stat32->name[0], namelen)) != 0)
+ return (error);
+
+ if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0)
+ return (error);
+ if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0)
+ return (error);
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat32))
+ if ((error = copyout(&data32, &stat32->data,
+ sizeof(data32))) != 0)
+ return (error);
+ td->td_retval[0] = 0;
+ return (error);
+}
+#endif
diff --git a/sys/kern/kern_mtxpool.c b/sys/kern/kern_mtxpool.c
new file mode 100644
index 0000000..23b41bb
--- /dev/null
+++ b/sys/kern/kern_mtxpool.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2001 Matthew Dillon. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Mutex pool routines. These routines are designed to be used as short
+ * term leaf mutexes (e.g. the last mutex you might acquire other than
+ * calling msleep()). They operate using a shared pool. A mutex is chosen
+ * from the pool based on the supplied pointer (which may or may not be
+ * valid).
+ *
+ * Advantages:
+ * - no structural overhead. Mutexes can be associated with structures
+ * without adding bloat to the structures.
+ *	- mutexes can be obtained for invalid pointers, useful when using
+ *	  mutexes to interlock destructor operations.
+ * - no initialization/destructor overhead.
+ * - can be used with msleep.
+ *
+ * Disadvantages:
+ * - should generally only be used as leaf mutexes.
+ *	- pool/pool dependency ordering cannot be depended on.
+ *	- possible L1 cache mastership contention between cpus.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+
+static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool");
+
+/* Pool sizes must be a power of two */
+#ifndef MTX_POOL_LOCKBUILDER_SIZE
+#define MTX_POOL_LOCKBUILDER_SIZE 128
+#endif
+#ifndef MTX_POOL_SLEEP_SIZE
+#define MTX_POOL_SLEEP_SIZE 128
+#endif
+
+struct mtxpool_header {
+ int mtxpool_size;
+ int mtxpool_mask;
+ int mtxpool_shift;
+ int mtxpool_next;
+};
+
+struct mtx_pool {
+ struct mtxpool_header mtx_pool_header;
+ struct mtx mtx_pool_ary[1];
+};
+
+static struct mtx_pool_lockbuilder {
+ struct mtxpool_header mtx_pool_header;
+ struct mtx mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE];
+} lockbuilder_pool;
+
+#define mtx_pool_size mtx_pool_header.mtxpool_size
+#define mtx_pool_mask mtx_pool_header.mtxpool_mask
+#define mtx_pool_shift mtx_pool_header.mtxpool_shift
+#define mtx_pool_next mtx_pool_header.mtxpool_next
+
+struct mtx_pool *mtxpool_sleep;
+struct mtx_pool *mtxpool_lockbuilder;
+
+#if UINTPTR_MAX == UINT64_MAX /* 64 bits */
+# define POINTER_BITS 64
+# define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */
+#else /* assume 32 bits */
+# define POINTER_BITS 32
+# define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */
+#endif
+
+/*
+ * Return the (shared) pool mutex associated with the specified address.
+ * The returned mutex is a leaf level mutex, meaning that if you obtain it
+ * you cannot obtain any other mutexes until you release it. You can
+ * legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_find(struct mtx_pool *pool, void *ptr)
+{
+ int p;
+
+ KASSERT(pool != NULL, ("_mtx_pool_find(): null pool"));
+ /*
+ * Fibonacci hash, see Knuth's
+ * _Art of Computer Programming, Volume 3 / Sorting and Searching_
+ */
+ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) &
+ pool->mtx_pool_mask;
+ return (&pool->mtx_pool_ary[p]);
+}
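+
+/*
+ * Usage sketch (illustrative): hash an object pointer to a shared pool
+ * mutex and sleep on it while waiting for a condition.  The object "obj"
+ * and its "ready" flag are hypothetical; mtxpool_sleep is the dynamic
+ * pool created by mtx_pool_setup_dynamic() below.
+ *
+ *	struct mtx *mtxp;
+ *
+ *	mtxp = mtx_pool_find(mtxpool_sleep, obj);
+ *	mtx_lock(mtxp);
+ *	while (!obj->ready)
+ *		msleep(obj, mtxp, PWAIT, "objrdy", 0);
+ *	mtx_unlock(mtxp);
+ */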
+
+static void
+mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size,
+ int opts)
+{
+ int i, maskbits;
+
+ pool->mtx_pool_size = pool_size;
+ pool->mtx_pool_mask = pool_size - 1;
+ for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1)
+ maskbits++;
+ pool->mtx_pool_shift = POINTER_BITS - maskbits;
+ pool->mtx_pool_next = 0;
+ for (i = 0; i < pool_size; ++i)
+ mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts);
+}
+
+struct mtx_pool *
+mtx_pool_create(const char *mtx_name, int pool_size, int opts)
+{
+ struct mtx_pool *pool;
+
+ if (pool_size <= 0 || !powerof2(pool_size)) {
+ printf("WARNING: %s pool size is not a power of 2.\n",
+ mtx_name);
+ pool_size = 128;
+ }
+ pool = malloc(sizeof (struct mtx_pool) +
+ ((pool_size - 1) * sizeof (struct mtx)),
+ M_MTXPOOL, M_WAITOK | M_ZERO);
+ mtx_pool_initialize(pool, mtx_name, pool_size, opts);
+ return pool;
+}
+
+void
+mtx_pool_destroy(struct mtx_pool **poolp)
+{
+ int i;
+ struct mtx_pool *pool = *poolp;
+
+ for (i = pool->mtx_pool_size - 1; i >= 0; --i)
+ mtx_destroy(&pool->mtx_pool_ary[i]);
+ free(pool, M_MTXPOOL);
+ *poolp = NULL;
+}
+
+static void
+mtx_pool_setup_static(void *dummy __unused)
+{
+ mtx_pool_initialize((struct mtx_pool *)&lockbuilder_pool,
+ "lockbuilder mtxpool", MTX_POOL_LOCKBUILDER_SIZE,
+ MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ mtxpool_lockbuilder = (struct mtx_pool *)&lockbuilder_pool;
+}
+
+static void
+mtx_pool_setup_dynamic(void *dummy __unused)
+{
+ mtxpool_sleep = mtx_pool_create("sleep mtxpool",
+ MTX_POOL_SLEEP_SIZE, MTX_DEF);
+}
+
+/*
+ * Obtain a (shared) mutex from the pool. The returned mutex is a leaf
+ * level mutex, meaning that if you obtain it you cannot obtain any other
+ * mutexes until you release it. You can legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_alloc(struct mtx_pool *pool)
+{
+ int i;
+
+ KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool"));
+ /*
+ * mtx_pool_next is unprotected against multiple accesses,
+ * but simultaneous access by two CPUs should not be very
+ * harmful.
+ */
+ i = pool->mtx_pool_next;
+ pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask;
+ return (&pool->mtx_pool_ary[i]);
+}
+
+/*
+ * The lockbuilder pool must be initialized early because the lockmgr
+ * and sx locks depend on it. The sx locks are used in the kernel
+ * memory allocator. The lockmgr subsystem is initialized by
+ * SYSINIT(..., SI_SUB_LOCKMGR, ...).
+ *
+ * We can't call malloc() to dynamically allocate the sleep pool
+ * until after kmeminit() has been called, which is done by
+ * SYSINIT(..., SI_SUB_KMEM, ...).
+ */
+SYSINIT(mtxpooli1, SI_SUB_MTX_POOL_STATIC, SI_ORDER_FIRST,
+ mtx_pool_setup_static, NULL);
+SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST,
+ mtx_pool_setup_dynamic, NULL);
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
new file mode 100644
index 0000000..cd1ed7d
--- /dev/null
+++ b/sys/kern/kern_mutex.c
@@ -0,0 +1,1009 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Machine independent bits of mutex implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_adaptive_mutexes.h"
+#include "opt_ddb.h"
+#include "opt_global.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/turnstile.h>
+#include <sys/vmmeter.h>
+#include <sys/lock_profile.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/cpu.h>
+
+#include <ddb/ddb.h>
+
+#include <fs/devfs/devfs_int.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
+#define ADAPTIVE_MUTEXES
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DEFINE( , , lock, failed);
+#endif
+
+/*
+ * Return the mutex address when the lock cookie address is provided.
+ * This functionality assumes that struct mtx* have a member named mtx_lock.
+ */
+#define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock))
+
+/*
+ * Internal utility macros.
+ */
+#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED)
+
+#define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
+
+#define mtx_owner(m) ((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
+
+static void assert_mtx(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_mtx(const struct lock_object *lock);
+#endif
+static void lock_mtx(struct lock_object *lock, int how);
+static void lock_spin(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_mtx(const struct lock_object *lock,
+ struct thread **owner);
+#endif
+static int unlock_mtx(struct lock_object *lock);
+static int unlock_spin(struct lock_object *lock);
+
+/*
+ * Lock classes for sleep and spin mutexes.
+ */
+struct lock_class lock_class_mtx_sleep = {
+ .lc_name = "sleep mutex",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
+ .lc_assert = assert_mtx,
+#ifdef DDB
+ .lc_ddb_show = db_show_mtx,
+#endif
+ .lc_lock = lock_mtx,
+ .lc_unlock = unlock_mtx,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_mtx,
+#endif
+};
+struct lock_class lock_class_mtx_spin = {
+ .lc_name = "spin mutex",
+ .lc_flags = LC_SPINLOCK | LC_RECURSABLE,
+ .lc_assert = assert_mtx,
+#ifdef DDB
+ .lc_ddb_show = db_show_mtx,
+#endif
+ .lc_lock = lock_spin,
+ .lc_unlock = unlock_spin,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_mtx,
+#endif
+};
+
+/*
+ * System-wide mutexes
+ */
+struct mtx blocked_lock;
+struct mtx Giant;
+
+void
+assert_mtx(const struct lock_object *lock, int what)
+{
+
+ mtx_assert((const struct mtx *)lock, what);
+}
+
+void
+lock_mtx(struct lock_object *lock, int how)
+{
+
+ mtx_lock((struct mtx *)lock);
+}
+
+void
+lock_spin(struct lock_object *lock, int how)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
+
+int
+unlock_mtx(struct lock_object *lock)
+{
+ struct mtx *m;
+
+ m = (struct mtx *)lock;
+ mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(m);
+ return (0);
+}
+
+int
+unlock_spin(struct lock_object *lock)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_mtx(const struct lock_object *lock, struct thread **owner)
+{
+ const struct mtx *m = (const struct mtx *)lock;
+
+ *owner = mtx_owner(m);
+ return (mtx_unowned(m) == 0);
+}
+#endif
+
+/*
+ * Function versions of the inlined __mtx_* macros. These are used by
+ * modules and can also be called from assembly language if needed.
+ */
+void
+__mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d",
+ curthread, m->lock_object.lo_name, file, line));
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+ WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
+ LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
+
+ __mtx_lock(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
+ file, line);
+ curthread->td_locks++;
+}
+
+void
+__mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ mtx_assert(m, MA_OWNED);
+
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_UNLOCK_RELEASE, m);
+ __mtx_unlock(m, curthread, opts, file, line);
+ curthread->td_locks--;
+}
+
+void
+__mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
+ int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ if (mtx_owned(m))
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0,
+ ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ opts &= ~MTX_RECURSE;
+ WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+ file, line, NULL);
+ __mtx_lock_spin(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+void
+__mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
+ int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ mtx_assert(m, MA_OWNED);
+
+ __mtx_unlock_spin(m);
+}
+
+/*
+ * The important part of mtx_trylock{,_flags}()
+ * Tries to acquire lock `m.' If this function is called on a mutex that
+ * is already owned, it will recursively acquire the lock.
+ */
+int
+_mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d",
+ curthread, m->lock_object.lo_name, file, line));
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+
+ if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0)) {
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ rval = 1;
+ } else
+ rval = _mtx_obtain_lock(m, (uintptr_t)curthread);
+ opts &= ~MTX_RECURSE;
+
+ LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE,
+ m, contested, waittime, file, line);
+
+ }
+
+ return (rval);
+}
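+
+/*
+ * Usage sketch (illustrative): mtx_trylock() is the usual front end for
+ * the function above.  A common pattern is backing off to avoid a lock
+ * order reversal: while holding "a", try "b" and, if that fails, drop
+ * "a" and take the two locks in the documented order.  Both mutexes are
+ * hypothetical.
+ *
+ *	mtx_lock(&a);
+ *	if (!mtx_trylock(&b)) {
+ *		mtx_unlock(&a);
+ *		mtx_lock(&b);
+ *		mtx_lock(&a);
+ *	}
+ *	... both locks are now held ...
+ *	mtx_unlock(&b);
+ *	mtx_unlock(&a);
+ */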
+
+/*
+ * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
+ *
+ * We call this if the lock is either contested (i.e. we need to go to
+ * sleep waiting for it), or if we need to recurse on it.
+ */
+void
+__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
+ const char *file, int line)
+{
+ struct mtx *m;
+ struct turnstile *ts;
+ uintptr_t v;
+#ifdef ADAPTIVE_MUTEXES
+ volatile struct thread *owner;
+#endif
+#ifdef KTR
+ int cont_logged = 0;
+#endif
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (mtx_owned(m)) {
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0,
+ ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ opts &= ~MTX_RECURSE;
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
+ return;
+ }
+ opts &= ~MTX_RECURSE;
+
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object,
+ &contested, &waittime);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR4(KTR_LOCK,
+ "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
+ m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
+
+ while (!_mtx_obtain_lock(m, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef ADAPTIVE_MUTEXES
+ /*
+ * If the owner is running on another CPU, spin until the
+ * owner stops running or the state of the lock changes.
+ */
+ v = m->mtx_lock;
+ if (v != MTX_UNOWNED) {
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&m->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, m, owner);
+ while (mtx_owner(m) == owner &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ }
+#endif
+
+ ts = turnstile_trywait(&m->lock_object);
+ v = m->mtx_lock;
+
+ /*
+ * Check if the lock has been released while spinning for
+ * the turnstile chain lock.
+ */
+ if (v == MTX_UNOWNED) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+#ifdef ADAPTIVE_MUTEXES
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+#endif
+
+ /*
+ * If the mutex isn't already contested and a failure occurs
+ * setting the contested bit, the mutex was either released
+ * or the state of the MTX_RECURSED bit changed.
+ */
+ if ((v & MTX_CONTESTED) == 0 &&
+ !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+ /*
+ * We definitely must sleep for this lock.
+ */
+ mtx_assert(m, MA_NOTOWNED);
+
+#ifdef KTR
+ if (!cont_logged) {
+ CTR6(KTR_CONTENTION,
+ "contention: %p at %s:%d wants %s, taken by %s:%d",
+ (void *)tid, file, line, m->lock_object.lo_name,
+ WITNESS_FILE(&m->lock_object),
+ WITNESS_LINE(&m->lock_object));
+ cont_logged = 1;
+ }
+#endif
+
+ /*
+ * Block on the turnstile.
+ */
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ }
+#ifdef KTR
+ if (cont_logged) {
+ CTR4(KTR_CONTENTION,
+ "contention end: %s acquired by %p at %s:%d",
+ m->lock_object.lo_name, (void *)tid, file, line);
+ }
+#endif
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE, m, contested,
+ waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_MTX_LOCK_BLOCK, m, sleep_time);
+
+ /*
+ * Only record the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_MTX_LOCK_SPIN, m, (spin_cnt - sleep_cnt));
+#endif
+}
+
+static void
+_mtx_lock_spin_failed(struct mtx *m)
+{
+ struct thread *td;
+
+ td = mtx_owner(m);
+
+ /* If the mutex is unlocked, try again. */
+ if (td == NULL)
+ return;
+
+ printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
+ m, m->lock_object.lo_name, td, td->td_tid);
+#ifdef WITNESS
+ witness_display_spinlock(&m->lock_object, td, printf);
+#endif
+ panic("spin lock held too long");
+}
+
+#ifdef SMP
+/*
+ * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock.
+ *
+ * This is only called if we need to actually spin for the lock. Recursion
+ * is handled inline.
+ */
+void
+_mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
+ const char *file, int line)
+{
+ struct mtx *m;
+ int i = 0;
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
+
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+ while (!_mtx_obtain_lock(m, tid)) {
+
+ /* Give interrupts a chance while we spin. */
+ spinlock_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000) {
+ cpu_spinwait();
+ continue;
+ }
+ if (i < 60000000 || kdb_active || panicstr != NULL)
+ DELAY(1);
+ else
+ _mtx_lock_spin_failed(m);
+ cpu_spinwait();
+ }
+ spinlock_enter();
+ }
+
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
+
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE, m,
+ contested, waittime, (file), (line));
+ LOCKSTAT_RECORD1(LS_MTX_SPIN_LOCK_SPIN, m, i);
+}
+#endif /* SMP */
+
+void
+thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
+{
+ struct mtx *m;
+ uintptr_t tid;
+ int i;
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+#endif
+
+ i = 0;
+ tid = (uintptr_t)curthread;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ for (;;) {
+retry:
+ spinlock_enter();
+ m = td->td_lock;
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("thread_lock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("thread_lock() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ if (mtx_owned(m))
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
+ ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&m->lock_object,
+ opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
+ while (!_mtx_obtain_lock(m, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ if (m->mtx_lock == tid) {
+ m->mtx_recurse++;
+ break;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object,
+ &contested, &waittime);
+ /* Give interrupts a chance while we spin. */
+ spinlock_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000)
+ cpu_spinwait();
+ else if (i < 60000000 ||
+ kdb_active || panicstr != NULL)
+ DELAY(1);
+ else
+ _mtx_lock_spin_failed(m);
+ cpu_spinwait();
+ if (m != td->td_lock)
+ goto retry;
+ }
+ spinlock_enter();
+ }
+ if (m == td->td_lock)
+ break;
+ __mtx_unlock_spin(m); /* does spinlock_exit() */
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE,
+ m, contested, waittime, (file), (line));
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCKSTAT_RECORD1(LS_THREAD_LOCK_SPIN, m, spin_cnt);
+}
+
+struct mtx *
+thread_lock_block(struct thread *td)
+{
+ struct mtx *lock;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = &blocked_lock;
+ mtx_unlock_spin(lock);
+
+ return (lock);
+}
+
+void
+thread_lock_unblock(struct thread *td, struct mtx *new)
+{
+ mtx_assert(new, MA_OWNED);
+ MPASS(td->td_lock == &blocked_lock);
+ atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
+}
+
+void
+thread_lock_set(struct thread *td, struct mtx *new)
+{
+ struct mtx *lock;
+
+ mtx_assert(new, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = new;
+ mtx_unlock_spin(lock);
+}
+
+/*
+ * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
+ *
+ * We are only called here if the lock is recursed or contested (i.e. we
+ * need to wake up a blocked thread).
+ */
+void
+__mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+ struct turnstile *ts;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (mtx_recursed(m)) {
+ if (--(m->mtx_recurse) == 0)
+ atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
+ return;
+ }
+
+ /*
+ * We have to lock the chain before the turnstile so this turnstile
+ * can be removed from the hash list if it is empty.
+ */
+ turnstile_chain_lock(&m->lock_object);
+ ts = turnstile_lookup(&m->lock_object);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
+ MPASS(ts != NULL);
+ turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+ _mtx_release_lock_quick(m);
+
+ /*
+ * This turnstile is now no longer associated with the mutex. We can
+	 * unlock the chain lock so a new turnstile may take its place.
+ */
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&m->lock_object);
+}
+
+/*
+ * All the unlocking of MTX_SPIN locks is done inline.
+ * See the __mtx_unlock_spin() macro for the details.
+ */
+
+/*
+ * The backing function for the INVARIANTS-enabled mtx_assert()
+ */
+#ifdef INVARIANT_SUPPORT
+void
+__mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
+{
+ const struct mtx *m;
+
+ if (panicstr != NULL || dumping)
+ return;
+
+ m = mtxlock2mtx(c);
+
+ switch (what) {
+ case MA_OWNED:
+ case MA_OWNED | MA_RECURSED:
+ case MA_OWNED | MA_NOTRECURSED:
+ if (!mtx_owned(m))
+ panic("mutex %s not owned at %s:%d",
+ m->lock_object.lo_name, file, line);
+ if (mtx_recursed(m)) {
+ if ((what & MA_NOTRECURSED) != 0)
+ panic("mutex %s recursed at %s:%d",
+ m->lock_object.lo_name, file, line);
+ } else if ((what & MA_RECURSED) != 0) {
+ panic("mutex %s unrecursed at %s:%d",
+ m->lock_object.lo_name, file, line);
+ }
+ break;
+ case MA_NOTOWNED:
+ if (mtx_owned(m))
+ panic("mutex %s owned at %s:%d",
+ m->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("unknown mtx_assert at %s:%d", file, line);
+ }
+}
+#endif
+
+/*
+ * The MUTEX_DEBUG-enabled mtx_validate()
+ *
+ * Most of these checks have been moved off into the LO_INITIALIZED flag
+ * maintained by the witness code.
+ */
+#ifdef MUTEX_DEBUG
+
+void mtx_validate(struct mtx *);
+
+void
+mtx_validate(struct mtx *m)
+{
+
+/*
+ * XXX: When kernacc() does not require Giant we can reenable this check
+ */
+#ifdef notyet
+ /*
+ * Can't call kernacc() from early init386(), especially when
+ * initializing Giant mutex, because some stuff in kernacc()
+ * requires Giant itself.
+ */
+ if (!cold)
+ if (!kernacc((caddr_t)m, sizeof(m),
+ VM_PROT_READ | VM_PROT_WRITE))
+ panic("Can't read and write to mutex %p", m);
+#endif
+}
+#endif
+
+/*
+ * General init routine used by the MTX_SYSINIT() macro.
+ */
+void
+mtx_sysinit(void *arg)
+{
+ struct mtx_args *margs = arg;
+
+ mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL,
+ margs->ma_opts);
+}
+
+/*
+ * Mutex initialization routine; initialize lock `m' of type contained in
+ * `opts' with options contained in `opts' and name `name.' The optional
+ * lock type `type' is used as a general lock category name for use with
+ * witness.
+ */
+void
+_mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts)
+{
+ struct mtx *m;
+ struct lock_class *class;
+ int flags;
+
+ m = mtxlock2mtx(c);
+
+ MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
+ MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock,
+ ("%s: mtx_lock not aligned for %s: %p", __func__, name,
+ &m->mtx_lock));
+
+#ifdef MUTEX_DEBUG
+ /* Diagnostic and error correction */
+ mtx_validate(m);
+#endif
+
+ /* Determine lock class and lock flags. */
+ if (opts & MTX_SPIN)
+ class = &lock_class_mtx_spin;
+ else
+ class = &lock_class_mtx_sleep;
+ flags = 0;
+ if (opts & MTX_QUIET)
+ flags |= LO_QUIET;
+ if (opts & MTX_RECURSE)
+ flags |= LO_RECURSABLE;
+ if ((opts & MTX_NOWITNESS) == 0)
+ flags |= LO_WITNESS;
+ if (opts & MTX_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & MTX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+
+ /* Initialize mutex. */
+ lock_init(&m->lock_object, class, name, type, flags);
+
+ m->mtx_lock = MTX_UNOWNED;
+ m->mtx_recurse = 0;
+}
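+
+/*
+ * Usage sketch (illustrative): typical lifecycle of a sleep mutex
+ * protecting a hypothetical softc.  Statically allocated mutexes can use
+ * the MTX_SYSINIT() macro instead, which ends up in mtx_sysinit() above.
+ *
+ *	struct example_softc {
+ *		struct mtx	sc_mtx;
+ *		int		sc_count;
+ *	};
+ *
+ *	mtx_init(&sc->sc_mtx, "example softc", NULL, MTX_DEF);
+ *	...
+ *	mtx_lock(&sc->sc_mtx);
+ *	mtx_assert(&sc->sc_mtx, MA_OWNED);
+ *	sc->sc_count++;
+ *	mtx_unlock(&sc->sc_mtx);
+ *	...
+ *	mtx_destroy(&sc->sc_mtx);
+ */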
+
+/*
+ * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be
+ * passed in as a flag here because if the corresponding mtx_init() was
+ * called with MTX_QUIET set, then it will already be set in the mutex's
+ * flags.
+ */
+void
+_mtx_destroy(volatile uintptr_t *c)
+{
+ struct mtx *m;
+
+ m = mtxlock2mtx(c);
+
+ if (!mtx_owned(m))
+ MPASS(mtx_unowned(m));
+ else {
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+
+ /* Perform the non-mtx related part of mtx_unlock_spin(). */
+ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
+ spinlock_exit();
+ else
+ curthread->td_locks--;
+
+ lock_profile_release_lock(&m->lock_object);
+ /* Tell witness this isn't locked to make it happy. */
+ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
+ __LINE__);
+ }
+
+ m->mtx_lock = MTX_DESTROYED;
+ lock_destroy(&m->lock_object);
+}
+
+/*
+ * Initialize the mutex code and system mutexes.  This is called from the MD
+ * startup code prior to mi_startup(). The per-CPU data space needs to be
+ * setup before this is called.
+ */
+void
+mutex_init(void)
+{
+
+ /* Setup turnstiles so that sleep mutexes work. */
+ init_turnstiles();
+
+ /*
+ * Initialize mutexes.
+ */
+ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
+ mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
+ blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */
+ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
+ mtx_lock(&Giant);
+}
+
+#ifdef DDB
+void
+db_show_mtx(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct mtx *m;
+
+ m = (const struct mtx *)lock;
+
+ db_printf(" flags: {");
+ if (LOCK_CLASS(lock) == &lock_class_mtx_spin)
+ db_printf("SPIN");
+ else
+ db_printf("DEF");
+ if (m->lock_object.lo_flags & LO_RECURSABLE)
+ db_printf(", RECURSE");
+ if (m->lock_object.lo_flags & LO_DUPOK)
+ db_printf(", DUPOK");
+ db_printf("}\n");
+ db_printf(" state: {");
+ if (mtx_unowned(m))
+ db_printf("UNOWNED");
+ else if (mtx_destroyed(m))
+ db_printf("DESTROYED");
+ else {
+ db_printf("OWNED");
+ if (m->mtx_lock & MTX_CONTESTED)
+ db_printf(", CONTESTED");
+ if (m->mtx_lock & MTX_RECURSED)
+ db_printf(", RECURSED");
+ }
+ db_printf("}\n");
+ if (!mtx_unowned(m) && !mtx_destroyed(m)) {
+ td = mtx_owner(m);
+ db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (mtx_recursed(m))
+ db_printf(" recursed: %d\n", m->mtx_recurse);
+ }
+}
+#endif
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
new file mode 100644
index 0000000..7c95575
--- /dev/null
+++ b/sys/kern/kern_ntptime.c
@@ -0,0 +1,1055 @@
+/*-
+ ***********************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993-2001 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and *
+ * its documentation for any purpose and without fee is hereby *
+ * granted, provided that the above copyright notice appears in all *
+ * copies and that both the copyright notice and this permission *
+ * notice appear in supporting documentation, and that the name *
+ * University of Delaware not be used in advertising or publicity *
+ * pertaining to distribution of the software without specific, *
+ * written prior permission. The University of Delaware makes no *
+ * representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied *
+ * warranty. *
+ * *
+ **********************************************************************/
+
+/*
+ * Adapted from the original sources for FreeBSD and timecounters by:
+ * Poul-Henning Kamp <phk@FreeBSD.org>.
+ *
+ * The 32bit version of the "LP" macros seems a bit past its "sell by"
+ * date so I have retained only the 64bit version and included it directly
+ * in this file.
+ *
+ * Only minor changes done to interface with the timecounters over in
+ * sys/kern/kern_clock.c. Some of the comments below may be (even more)
+ * confusing and/or plain wrong in that context.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <sys/timetc.h>
+#include <sys/timepps.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#ifdef PPS_SYNC
+FEATURE(pps_sync, "Support usage of external PPS signal by kernel PLL");
+#endif
+
+/*
+ * Single-precision macros for 64-bit machines
+ */
+typedef int64_t l_fp;
+#define L_ADD(v, u) ((v) += (u))
+#define L_SUB(v, u) ((v) -= (u))
+#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
+#define L_NEG(v) ((v) = -(v))
+#define L_RSHIFT(v, n) \
+ do { \
+ if ((v) < 0) \
+ (v) = -(-(v) >> (n)); \
+ else \
+ (v) = (v) >> (n); \
+ } while (0)
+#define L_MPY(v, a) ((v) *= (a))
+#define L_CLR(v) ((v) = 0)
+#define L_ISNEG(v) ((v) < 0)
+#define L_LINT(v, a) ((v) = (int64_t)(a) << 32)
+#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
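+
+/*
+ * Worked example (illustrative) of the fixed-point format above: loading
+ * a 500 us offset with L_LINT(v, 500000) stores 500000 ns scaled by 2^32.
+ * L_RSHIFT(v, SHIFT_PLL), with SHIFT_PLL == 4 as defined below, divides
+ * it by 16, and L_GINT(v) recovers the integer nanosecond part.
+ *
+ *	l_fp v;
+ *	long ns;
+ *
+ *	L_LINT(v, 500000);		500000 ns stored as 500000 << 32
+ *	L_RSHIFT(v, SHIFT_PLL);		500000 >> 4 == 31250 ns
+ *	ns = L_GINT(v);			ns is now 31250
+ */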
+
+/*
+ * Generic NTP kernel interface
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by other routines in this module to adjust the
+ * phase and frequency of the clock discipline loop which controls the
+ * system clock.
+ *
+ * When the kernel time is reckoned directly in nanoseconds (NTP_NANO
+ * defined), the time at each tick interrupt is derived directly from
+ * the kernel time variable. When the kernel time is reckoned in
+ * microseconds, (NTP_NANO undefined), the time is derived from the
+ * kernel time variable together with a variable representing the
+ * leftover nanoseconds at the last tick interrupt. In either case, the
+ * current nanosecond time is reckoned from these values plus an
+ * interpolated value derived by the clock routines in another
+ * architecture-specific module. The interpolation can use either a
+ * dedicated counter or a processor cycle counter (PCC) implemented in
+ * some architectures.
+ *
+ * Note that all routines must run at priority splclock or higher.
+ */
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The nanosecond clock discipline uses two variable types, time
+ * variables and frequency variables. Both types are represented as 64-
+ * bit fixed-point quantities with the decimal point between two 32-bit
+ * halves. On a 32-bit machine, each half is represented as a single
+ * word and mathematical operations are done using multiple-precision
+ * arithmetic. On a 64-bit machine, ordinary computer arithmetic is
+ * used.
+ *
+ * A time variable is a signed 64-bit fixed-point number in ns and
+ * fraction. It represents the remaining time offset to be amortized
+ * over succeeding tick interrupts. The maximum time offset is about
+ * 0.5 s and the resolution is about 2.3e-10 ns.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s| ns |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * A frequency variable is a signed 64-bit fixed-point number in ns/s
+ * and fraction. It represents the ns and fraction to be added to the
+ * kernel time variable at each second. The maximum frequency offset is
+ * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s s s s s s s s s s s| ns/s |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock.
+ */
+#define SHIFT_PLL 4 /* PLL loop gain (shift) */
+#define SHIFT_FLL 2 /* FLL loop gain (shift) */
+
+static int time_state = TIME_OK; /* clock state */
+int time_status = STA_UNSYNC; /* clock status bits */
+static long time_tai; /* TAI offset (s) */
+static long time_monitor; /* last time offset scaled (ns) */
+static long time_constant; /* poll interval (shift) (s) */
+static long time_precision = 1; /* clock precision (ns) */
+static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
+long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
+static long time_reftime; /* time at last adjustment (s) */
+static l_fp time_offset; /* time offset (ns) */
+static l_fp time_freq; /* frequency offset (ns/s) */
+static l_fp time_adj; /* tick adjust (ns/s) */
+
+static int64_t time_adjtime; /* correction from adjtime(2) (usec) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available and connected via a modem control lead. They establish
+ * the engineering parameters of the clock discipline loop when
+ * controlled by the PPS signal.
+ */
+#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
+#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
+#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
+#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
+#define PPS_VALID 120 /* PPS signal watchdog max (s) */
+#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
+#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
+
+static struct timespec pps_tf[3]; /* phase median filter */
+static l_fp pps_freq; /* scaled frequency offset (ns/s) */
+static long pps_fcount; /* frequency accumulator */
+static long pps_jitter; /* nominal jitter (ns) */
+static long pps_stabil; /* nominal stability (scaled ns/s) */
+static long pps_lastsec; /* time at last calibration (s) */
+static int pps_valid; /* signal watchdog counter */
+static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
+static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
+static int pps_intcnt; /* wander counter */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+#endif /* PPS_SYNC */
+/*
+ * End of phase/frequency-lock loop (PLL/FLL) definitions
+ */
+
+static void ntp_init(void);
+static void hardupdate(long offset);
+static void ntp_gettime1(struct ntptimeval *ntvp);
+static int ntp_is_time_error(void);
+
+static int
+ntp_is_time_error(void)
+{
+ /*
+ * Status word error decode. If any of these conditions occur,
+ * an error is returned, instead of the status word. Most
+ * applications will care only about the fact the system clock
+ * may not be trusted, not about the details.
+ *
+ * Hardware or software error
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+
+ /*
+ * PPS signal lost when either time or frequency synchronization
+ * requested
+ */
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+
+ /*
+ * PPS jitter exceeded when time synchronization requested
+ */
+ (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) ||
+
+ /*
+ * PPS wander exceeded or calibration error when frequency
+ * synchronization requested
+ */
+ (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)))
+ return (1);
+
+ return (0);
+}
+
+static void
+ntp_gettime1(struct ntptimeval *ntvp)
+{
+ struct timespec atv; /* nanosecond time */
+
+ GIANT_REQUIRED;
+
+ nanotime(&atv);
+ ntvp->time.tv_sec = atv.tv_sec;
+ ntvp->time.tv_nsec = atv.tv_nsec;
+ ntvp->maxerror = time_maxerror;
+ ntvp->esterror = time_esterror;
+ ntvp->tai = time_tai;
+ ntvp->time_state = time_state;
+
+ if (ntp_is_time_error())
+ ntvp->time_state = TIME_ERROR;
+}
+
+/*
+ * ntp_gettime() - NTP user application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note that
+ * the TAI offset is returned in the ntvtimeval.tai structure member.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_gettime_args {
+ struct ntptimeval *ntvp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
+{
+ struct ntptimeval ntv;
+
+ mtx_lock(&Giant);
+ ntp_gettime1(&ntv);
+ mtx_unlock(&Giant);
+
+ td->td_retval[0] = ntv.time_state;
+ return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
+}
+
+static int
+ntp_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct ntptimeval ntv; /* temporary structure */
+
+ ntp_gettime1(&ntv);
+
+ return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
+SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", "");
+
+#ifdef PPS_SYNC
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW,
+ &pps_shiftmax, 0, "Max interval duration (sec) (shift)");
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW,
+ &pps_shift, 0, "Interval duration (sec) (shift)");
+SYSCTL_LONG(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD,
+ &time_monitor, 0, "Last time offset scaled (ns)");
+
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD,
+ &pps_freq, sizeof(pps_freq), "I", "Scaled frequency offset (ns/sec)");
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD,
+ &time_freq, sizeof(time_freq), "I", "Frequency offset (ns/sec)");
+#endif
+
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note that
+ * the timex.constant structure member has a dual purpose to set the time
+ * constant and to set the TAI offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_adjtime_args {
+ struct timex *tp;
+};
+#endif
+
+int
+sys_ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
+{
+ struct timex ntv; /* temporary structure */
+	long freq;		/* frequency (ns/s) */
+ int modes; /* mode bits from structure */
+ int s; /* caller priority */
+ int error;
+
+ error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
+ if (error)
+ return(error);
+
+ /*
+ * Update selected clock variables - only the superuser can
+ * change anything. Note that there is no error checking here on
+ * the assumption the superuser should know what it is doing.
+ * Note that either the time constant or TAI offset are loaded
+ * from the ntv.constant member, depending on the mode bits. If
+ * the STA_PLL bit in the status word is cleared, the state and
+ * status words are reset to the initial values at boot.
+ */
+ mtx_lock(&Giant);
+ modes = ntv.modes;
+ if (modes)
+ error = priv_check(td, PRIV_NTP_ADJTIME);
+ if (error)
+ goto done2;
+ s = splclock();
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+#ifdef PPS_SYNC
+ pps_shift = PPS_FAVG;
+#endif /* PPS_SYNC */
+ }
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST) {
+ if (ntv.constant < 0)
+ time_constant = 0;
+ else if (ntv.constant > MAXTC)
+ time_constant = MAXTC;
+ else
+ time_constant = ntv.constant;
+ }
+ if (modes & MOD_TAI) {
+ if (ntv.constant > 0) /* XXX zero & negative numbers ? */
+ time_tai = ntv.constant;
+ }
+#ifdef PPS_SYNC
+ if (modes & MOD_PPSMAX) {
+ if (ntv.shift < PPS_FAVG)
+ pps_shiftmax = PPS_FAVG;
+ else if (ntv.shift > PPS_FAVGMAX)
+ pps_shiftmax = PPS_FAVGMAX;
+ else
+ pps_shiftmax = ntv.shift;
+ }
+#endif /* PPS_SYNC */
+ if (modes & MOD_NANO)
+ time_status |= STA_NANO;
+ if (modes & MOD_MICRO)
+ time_status &= ~STA_NANO;
+ if (modes & MOD_CLKB)
+ time_status |= STA_CLK;
+ if (modes & MOD_CLKA)
+ time_status &= ~STA_CLK;
+ if (modes & MOD_FREQUENCY) {
+ freq = (ntv.freq * 1000LL) >> 16;
+ if (freq > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (freq < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+ else {
+ /*
+ * ntv.freq is [PPM * 2^16] = [us/s * 2^16]
+ * time_freq is [ns/s * 2^32]
+ */
+ time_freq = ntv.freq * 1000LL * 65536LL;
+ }
+#ifdef PPS_SYNC
+ pps_freq = time_freq;
+#endif /* PPS_SYNC */
+ }
+ if (modes & MOD_OFFSET) {
+ if (time_status & STA_NANO)
+ hardupdate(ntv.offset);
+ else
+ hardupdate(ntv.offset * 1000);
+ }
+
+ /*
+ * Retrieve all clock variables. Note that the TAI offset is
+	 * returned only by ntp_gettime().
+ */
+ if (time_status & STA_NANO)
+ ntv.offset = L_GINT(time_offset);
+ else
+ ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
+ ntv.freq = L_GINT((time_freq / 1000LL) << 16);
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ if (time_status & STA_NANO)
+ ntv.precision = time_precision;
+ else
+ ntv.precision = time_precision / 1000;
+ ntv.tolerance = MAXFREQ * SCALE_PPM;
+#ifdef PPS_SYNC
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
+ if (time_status & STA_NANO)
+ ntv.jitter = pps_jitter;
+ else
+ ntv.jitter = pps_jitter / 1000;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+#endif /* PPS_SYNC */
+ splx(s);
+
+ error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
+ if (error)
+ goto done2;
+
+ if (ntp_is_time_error())
+ td->td_retval[0] = TIME_ERROR;
+ else
+ td->td_retval[0] = time_state;
+
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
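+
+/*
+ * Worked example (illustrative) of the MOD_FREQUENCY scaling above: a
+ * daemon requesting +100 PPM passes ntv.freq = 100 << 16 = 6553600
+ * (us/s scaled by 2^16).  The range check computes
+ * freq = (6553600 * 1000) >> 16 = 100000 ns/s, well inside MAXFREQ, and
+ * the stored value is time_freq = 6553600 * 1000 * 65536, i.e.
+ * 100000 ns/s scaled by 2^32.  Reading it back,
+ * L_GINT((time_freq / 1000) << 16) returns the original 6553600.
+ */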
+
+/*
+ * ntp_update_second() - process the start of a new second
+ *
+ * This routine corresponds to second_overflow() in the original sources
+ * this file was adapted from, where it is called immediately after
+ * ntp_tick_adjust().  The two routines are normally combined; they are
+ * kept separate there only for the purposes of simulation.
+ */
+void
+ntp_update_second(int64_t *adjustment, time_t *newsec)
+{
+ int tickrate;
+ l_fp ftemp; /* 32/64-bit temporary */
+
+ /*
+ * On rollover of the second both the nanosecond and microsecond
+ * clocks are updated and the state machine cranked as
+ * necessary. The phase adjustment to be used for the next
+ * second is calculated and the maximum error is increased by
+ * the tolerance.
+ */
+ time_maxerror += MAXFREQ / 1000;
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The nano_time() routine or
+	 * external clock driver will ensure that reported time
+ * is always monotonic.
+ */
+ switch (time_state) {
+
+ /*
+ * No warning.
+ */
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ /*
+ * Insert second 23:59:60 following second
+ * 23:59:59.
+ */
+ case TIME_INS:
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if ((*newsec) % 86400 == 0) {
+ (*newsec)--;
+ time_state = TIME_OOP;
+ time_tai++;
+ }
+ break;
+
+ /*
+ * Delete second 23:59:59.
+ */
+ case TIME_DEL:
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if (((*newsec) + 1) % 86400 == 0) {
+ (*newsec)++;
+ time_tai--;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ /*
+ * Insert second in progress.
+ */
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ /*
+ * Wait for status bits to clear.
+ */
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the total time adjustment for the next second
+ * in ns. The offset is reduced by a factor depending on
+ * whether the PPS signal is operating. Note that the
+ * value is in effect scaled by the clock frequency,
+ * since the adjustment is added at each tick interrupt.
+ */
+ ftemp = time_offset;
+#ifdef PPS_SYNC
+ /* XXX even if PPS signal dies we should finish adjustment ? */
+ if (time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)
+ L_RSHIFT(ftemp, pps_shift);
+ else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#endif /* PPS_SYNC */
+ time_adj = ftemp;
+ L_SUB(time_offset, ftemp);
+ L_ADD(time_adj, time_freq);
+
+ /*
+	 * Apply any correction from adjtime(2).  If the offset is more than
+	 * one second we slew at a rate of 5 ms/s (5000 PPM), otherwise at
+	 * 500 us/s (500 PPM), until 500 us or less remains, which is then
+	 * applied in a single step.
+ */
+ if (time_adjtime != 0) {
+ if (time_adjtime > 1000000)
+ tickrate = 5000;
+ else if (time_adjtime < -1000000)
+ tickrate = -5000;
+ else if (time_adjtime > 500)
+ tickrate = 500;
+ else if (time_adjtime < -500)
+ tickrate = -500;
+ else
+ tickrate = time_adjtime;
+ time_adjtime -= tickrate;
+ L_LINT(ftemp, tickrate * 1000);
+ L_ADD(time_adj, ftemp);
+ }
+ *adjustment = time_adj;
+
+#ifdef PPS_SYNC
+ if (pps_valid > 0)
+ pps_valid--;
+ else
+ time_status &= ~STA_PPSSIGNAL;
+#endif /* PPS_SYNC */
+}
+
+/*
+ * ntp_init() - initialize variables and structures
+ *
+ * This routine must be called after the kernel variables hz and tick
+ * are set or changed and before the next tick interrupt. In this
+ * particular implementation, these values are assumed set elsewhere in
+ * the kernel. The design allows the clock frequency and tick interval
+ * to be changed while the system is running. So, this routine should
+ * probably be integrated with the code that does that.
+ */
+static void
+ntp_init()
+{
+
+ /*
+ * The following variables are initialized only at startup. Only
+ * those structures not cleared by the compiler need to be
+ * initialized, and these only in the simulator. In the actual
+ * kernel, any nonzero values here will quickly evaporate.
+ */
+ L_CLR(time_offset);
+ L_CLR(time_freq);
+#ifdef PPS_SYNC
+ pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
+ pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
+ pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
+ pps_fcount = 0;
+ L_CLR(pps_freq);
+#endif /* PPS_SYNC */
+}
+
+SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL);
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different than the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 256 s, operation should be in phase-lock mode,
+ * where the loop is disciplined to phase. For update intervals greater
+ * than 1024 s, operation should be in frequency-lock mode, where the
+ * loop is disciplined to frequency. Between 256 s and 1024 s, the mode
+ * is selected by the STA_MODE status bit.
+ */
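+/*
+ * Illustrative only, assuming the standard <sys/timex.h> interface (the
+ * variable measured_offset_us is hypothetical): a time daemon feeds
+ * offsets into this loop roughly as follows:
+ *
+ *	struct timex tx = { .modes = MOD_STATUS | MOD_OFFSET };
+ *	tx.status = STA_PLL;
+ *	tx.offset = measured_offset_us;
+ *	if (ntp_adjtime(&tx) == -1)
+ *		err(1, "ntp_adjtime");
+ */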
+static void
+hardupdate(offset)
+ long offset; /* clock offset (ns) */
+{
+ long mtemp;
+ l_fp ftemp;
+
+ /*
+ * Select how the phase is to be controlled and from which
+ * source. If the PPS signal is present and enabled to
+ * discipline the time, the PPS offset is used; otherwise, the
+ * argument offset is used.
+ */
+ if (!(time_status & STA_PLL))
+ return;
+ if (!(time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)) {
+ if (offset > MAXPHASE)
+ time_monitor = MAXPHASE;
+ else if (offset < -MAXPHASE)
+ time_monitor = -MAXPHASE;
+ else
+ time_monitor = offset;
+ L_LINT(time_offset, time_monitor);
+ }
+
+ /*
+ * Select how the frequency is to be controlled and in which
+ * mode (PLL or FLL). If the PPS signal is present and enabled
+ * to discipline the frequency, the PPS frequency is used;
+ * otherwise, the argument offset is used to compute it.
+ */
+ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
+ time_reftime = time_second;
+ return;
+ }
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time_second;
+ mtemp = time_second - time_reftime;
+ L_LINT(ftemp, time_monitor);
+ L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
+ L_MPY(ftemp, mtemp);
+ L_ADD(time_freq, ftemp);
+ time_status &= ~STA_MODE;
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
+ MAXSEC)) {
+ L_LINT(ftemp, (time_monitor << 4) / mtemp);
+ L_RSHIFT(ftemp, SHIFT_FLL + 4);
+ L_ADD(time_freq, ftemp);
+ time_status |= STA_MODE;
+ }
+ time_reftime = time_second;
+ if (L_GINT(time_freq) > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (L_GINT(time_freq) < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+}
+
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. There are two independent
+ * first-order feedback loops, one for the phase, the other for the
+ * frequency. The phase loop measures and grooms the PPS phase offset
+ * and leaves it in a handy spot for the seconds overflow routine. The
+ * frequency loop averages successive PPS phase differences and
+ * calculates the PPS frequency offset, which is also processed by the
+ * seconds overflow routine. The code requires the caller to capture the
+ * time and architecture-dependent hardware counter values in
+ * nanoseconds at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, except for the actual time and frequency variables, which
+ * are determined by this routine and updated atomically.
+ */
+void
+hardpps(tsp, nsec)
+ struct timespec *tsp; /* time at PPS */
+ long nsec; /* hardware counter at PPS */
+{
+ long u_sec, u_nsec, v_nsec; /* temps */
+ l_fp ftemp;
+
+ /*
+ * The signal is first processed by a range gate and frequency
+ * discriminator. The range gate rejects noise spikes outside
+ * the range +-500 us. The frequency discriminator rejects input
+ * signals with apparent frequency outside the range 1 +-500
+ * PPM. If two hits occur in the same second, we ignore the
+ * later hit; if not and a hit occurs outside the range gate,
+ * keep the later hit for later comparison, but do not process
+ * it.
+ */
+ time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
+ time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = PPS_VALID;
+ u_sec = tsp->tv_sec;
+ u_nsec = tsp->tv_nsec;
+ if (u_nsec >= (NANOSECOND >> 1)) {
+ u_nsec -= NANOSECOND;
+ u_sec++;
+ }
+ v_nsec = u_nsec - pps_tf[0].tv_nsec;
+ if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
+ MAXFREQ)
+ return;
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0].tv_sec = u_sec;
+ pps_tf[0].tv_nsec = u_nsec;
+
+ /*
+ * Compute the difference between the current and previous
+ * counter values. If the difference exceeds 0.5 s, assume it
+ * has wrapped around, so correct 1.0 s. If the result exceeds
+ * the tick interval, the sample point has crossed a tick
+ * boundary during the last second, so correct the tick. Very
+ * intricate.
+ */
+ u_nsec = nsec;
+ if (u_nsec > (NANOSECOND >> 1))
+ u_nsec -= NANOSECOND;
+ else if (u_nsec < -(NANOSECOND >> 1))
+ u_nsec += NANOSECOND;
+ pps_fcount += u_nsec;
+ if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
+ return;
+ time_status &= ~STA_PPSJITTER;
+
+ /*
+ * A three-stage median filter is used to help denoise the PPS
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
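+ /*
+ * Example (illustrative): if the three most recent samples are 120,
+ * 30 and 10 ns, the median 30 becomes the offset estimate (negated
+ * into time_monitor below) and 120 - 10 = 110 ns becomes the jitter
+ * estimate.
+ */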
+ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
+ if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
+ } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
+ }
+ } else {
+ if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
+ } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
+ }
+ }
+
+ /*
+ * Nominal jitter is due to PPS signal noise and interrupt
+ * latency. If it exceeds the popcorn threshold, the sample is
+ * discarded; otherwise, if so enabled, the time offset is
+ * updated. We can tolerate a modest loss of data here without
+ * much degrading time accuracy.
+ *
+ * The measurements being checked here were made with the system
+ * timecounter, so the popcorn threshold is not allowed to fall below
+ * the number of nanoseconds in two ticks of the timecounter. For a
+ * timecounter running faster than 1 GHz the lower bound is 2ns, just
+ * to avoid a nonsensical threshold of zero.
+ */
+ if (u_nsec > lmax(pps_jitter << PPS_POPCORN,
+ 2 * (NANOSECOND / (long)qmin(NANOSECOND, tc_getfrequency())))) {
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ time_monitor = -v_nsec;
+ L_LINT(time_offset, time_monitor);
+ }
+ pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
+ u_sec = pps_tf[0].tv_sec - pps_lastsec;
+ if (u_sec < (1 << pps_shift))
+ return;
+
+ /*
+ * At the end of the calibration interval the difference between
+ * the first and last counter values becomes the scaled
+ * frequency. It will later be divided by the length of the
+ * interval to determine the frequency update. If the frequency
+ * exceeds a sanity threshold, or if the actual calibration
+ * interval is not equal to the expected length, the data are
+ * discarded. We can tolerate a modest loss of data here without
+ * much degrading frequency accuracy.
+ */
+ pps_calcnt++;
+ v_nsec = -pps_fcount;
+ pps_lastsec = pps_tf[0].tv_sec;
+ pps_fcount = 0;
+ u_nsec = MAXFREQ << pps_shift;
+ if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
+ pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ return;
+ }
+
+ /*
+ * Here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * for four consecutive averaging intervals, the interval is
+ * doubled; if it is greater than the threshold for four
+ * consecutive intervals, the interval is halved. The scaled
+ * frequency offset is converted to frequency offset. The
+ * stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring.
+ */
+ L_LINT(ftemp, v_nsec);
+ L_RSHIFT(ftemp, pps_shift);
+ L_SUB(ftemp, pps_freq);
+ u_nsec = L_GINT(ftemp);
+ if (u_nsec > PPS_MAXWANDER) {
+ L_LINT(ftemp, PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else if (u_nsec < -PPS_MAXWANDER) {
+ L_LINT(ftemp, -PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else {
+ pps_intcnt++;
+ }
+ if (pps_intcnt >= 4) {
+ pps_intcnt = 4;
+ if (pps_shift < pps_shiftmax) {
+ pps_shift++;
+ pps_intcnt = 0;
+ }
+ } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
+ pps_intcnt = -4;
+ if (pps_shift > PPS_FAVG) {
+ pps_shift--;
+ pps_intcnt = 0;
+ }
+ }
+ if (u_nsec < 0)
+ u_nsec = -u_nsec;
+ pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
+
+ /*
+ * The PPS frequency is recalculated and clamped to the maximum
+ * MAXFREQ. If enabled, the system clock frequency is updated as
+ * well.
+ */
+ L_ADD(pps_freq, ftemp);
+ u_nsec = L_GINT(pps_freq);
+ if (u_nsec > MAXFREQ)
+ L_LINT(pps_freq, MAXFREQ);
+ else if (u_nsec < -MAXFREQ)
+ L_LINT(pps_freq, -MAXFREQ);
+ if (time_status & STA_PPSFREQ)
+ time_freq = pps_freq;
+}
+#endif /* PPS_SYNC */
+
+#ifndef _SYS_SYSPROTO_H_
+struct adjtime_args {
+ struct timeval *delta;
+ struct timeval *olddelta;
+};
+#endif
+/* ARGSUSED */
+int
+sys_adjtime(struct thread *td, struct adjtime_args *uap)
+{
+ struct timeval delta, olddelta, *deltap;
+ int error;
+
+ if (uap->delta) {
+ error = copyin(uap->delta, &delta, sizeof(delta));
+ if (error)
+ return (error);
+ deltap = &delta;
+ } else
+ deltap = NULL;
+ error = kern_adjtime(td, deltap, &olddelta);
+ if (uap->olddelta && error == 0)
+ error = copyout(&olddelta, uap->olddelta, sizeof(olddelta));
+ return (error);
+}
+
+int
+kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta)
+{
+ struct timeval atv;
+ int error;
+
+ mtx_lock(&Giant);
+ if (olddelta) {
+ atv.tv_sec = time_adjtime / 1000000;
+ atv.tv_usec = time_adjtime % 1000000;
+ if (atv.tv_usec < 0) {
+ atv.tv_usec += 1000000;
+ atv.tv_sec--;
+ }
+ *olddelta = atv;
+ }
+ if (delta) {
+ if ((error = priv_check(td, PRIV_ADJTIME))) {
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ time_adjtime = (int64_t)delta->tv_sec * 1000000 +
+ delta->tv_usec;
+ }
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+static struct callout resettodr_callout;
+static int resettodr_period = 1800;
+
+static void
+periodic_resettodr(void *arg __unused)
+{
+
+ if (!ntp_is_time_error()) {
+ mtx_lock(&Giant);
+ resettodr();
+ mtx_unlock(&Giant);
+ }
+ if (resettodr_period > 0)
+ callout_schedule(&resettodr_callout, resettodr_period * hz);
+}
+
+static void
+shutdown_resettodr(void *arg __unused, int howto __unused)
+{
+
+ callout_drain(&resettodr_callout);
+ if (resettodr_period > 0 && !ntp_is_time_error()) {
+ mtx_lock(&Giant);
+ resettodr();
+ mtx_unlock(&Giant);
+ }
+}
+
+static int
+sysctl_resettodr_period(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (error || !req->newptr)
+ return (error);
+ if (resettodr_period == 0)
+ callout_stop(&resettodr_callout);
+ else
+ callout_reset(&resettodr_callout, resettodr_period * hz,
+ periodic_resettodr, NULL);
+ return (0);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, rtc_save_period, CTLTYPE_INT|CTLFLAG_RW,
+ &resettodr_period, 1800, sysctl_resettodr_period, "I",
+ "Save system time to RTC with this period (in seconds)");
+TUNABLE_INT("machdep.rtc_save_period", &resettodr_period);
+
+static void
+start_periodic_resettodr(void *arg __unused)
+{
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_resettodr, NULL,
+ SHUTDOWN_PRI_FIRST);
+ callout_init(&resettodr_callout, 1);
+ if (resettodr_period == 0)
+ return;
+ callout_reset(&resettodr_callout, resettodr_period * hz,
+ periodic_resettodr, NULL);
+}
+
+SYSINIT(periodic_resettodr, SI_SUB_LAST, SI_ORDER_MIDDLE,
+ start_periodic_resettodr, NULL);
diff --git a/sys/kern/kern_osd.c b/sys/kern/kern_osd.c
new file mode 100644
index 0000000..184c4f0
--- /dev/null
+++ b/sys/kern/kern_osd.c
@@ -0,0 +1,403 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/jail.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rmlock.h>
+#include <sys/sx.h>
+#include <sys/queue.h>
+#include <sys/proc.h>
+#include <sys/osd.h>
+
+/* OSD (Object Specific Data) */
+
+static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data");
+
+static int osd_debug = 0;
+TUNABLE_INT("debug.osd", &osd_debug);
+SYSCTL_INT(_debug, OID_AUTO, osd, CTLFLAG_RW, &osd_debug, 0, "OSD debug level");
+
+#define OSD_DEBUG(...) do { \
+ if (osd_debug) { \
+ printf("OSD (%s:%u): ", __func__, __LINE__); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ } \
+} while (0)
+
+static void do_osd_del(u_int type, struct osd *osd, u_int slot,
+ int list_locked);
+
+/*
+ * Lists of objects with OSD.
+ *
+ * Lock key:
+ * (m) osd_module_lock
+ * (o) osd_object_lock
+ * (l) osd_list_lock
+ */
+static LIST_HEAD(, osd) osd_list[OSD_LAST + 1]; /* (m) */
+static osd_method_t *osd_methods[OSD_LAST + 1]; /* (m) */
+static u_int osd_nslots[OSD_LAST + 1]; /* (m) */
+static osd_destructor_t *osd_destructors[OSD_LAST + 1]; /* (o) */
+static const u_int osd_nmethods[OSD_LAST + 1] = {
+ [OSD_JAIL] = PR_MAXMETHOD,
+};
+
+static struct sx osd_module_lock[OSD_LAST + 1];
+static struct rmlock osd_object_lock[OSD_LAST + 1];
+static struct mtx osd_list_lock[OSD_LAST + 1];
+
+static void
+osd_default_destructor(void *value __unused)
+{
+ /* Do nothing. */
+}
+
+int
+osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods)
+{
+ void *newptr;
+ u_int i, m;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+
+ /*
+ * If no destructor is given, use the default one. We need some
+ * destructor, because a NULL destructor marks an unused slot.
+ */
+ if (destructor == NULL)
+ destructor = osd_default_destructor;
+
+ sx_xlock(&osd_module_lock[type]);
+ /*
+ * First, try to find an unused slot.
+ */
+ for (i = 0; i < osd_nslots[type]; i++) {
+ if (osd_destructors[type][i] == NULL) {
+ OSD_DEBUG("Unused slot found (type=%u, slot=%u).",
+ type, i);
+ break;
+ }
+ }
+ /*
+ * If no unused slot was found, allocate one.
+ */
+ if (i == osd_nslots[type]) {
+ osd_nslots[type]++;
+ if (osd_nmethods[type] != 0)
+ osd_methods[type] = realloc(osd_methods[type],
+ sizeof(osd_method_t) * osd_nslots[type] *
+ osd_nmethods[type], M_OSD, M_WAITOK);
+ newptr = malloc(sizeof(osd_destructor_t) * osd_nslots[type],
+ M_OSD, M_WAITOK);
+ rm_wlock(&osd_object_lock[type]);
+ bcopy(osd_destructors[type], newptr,
+ sizeof(osd_destructor_t) * i);
+ free(osd_destructors[type], M_OSD);
+ osd_destructors[type] = newptr;
+ rm_wunlock(&osd_object_lock[type]);
+ OSD_DEBUG("New slot allocated (type=%u, slot=%u).",
+ type, i + 1);
+ }
+
+ osd_destructors[type][i] = destructor;
+ if (osd_nmethods[type] != 0) {
+ for (m = 0; m < osd_nmethods[type]; m++)
+ osd_methods[type][i * osd_nmethods[type] + m] =
+ methods != NULL ? methods[m] : NULL;
+ }
+ sx_xunlock(&osd_module_lock[type]);
+ return (i + 1);
+}
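+
+/*
+ * Minimal usage sketch (illustrative only; OSD_THREAD and the thread's
+ * td_osd member are assumptions, as is the mymod_* consumer):
+ *
+ *	static u_int mymod_slot;
+ *
+ *	mymod_slot = osd_register(OSD_THREAD, mymod_dtor, NULL);
+ *	error = osd_set(OSD_THREAD, &td->td_osd, mymod_slot, data);
+ *	data = osd_get(OSD_THREAD, &td->td_osd, mymod_slot);
+ *	osd_deregister(OSD_THREAD, mymod_slot);
+ */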
+
+void
+osd_deregister(u_int type, u_int slot)
+{
+ struct osd *osd, *tosd;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ sx_xlock(&osd_module_lock[type]);
+ rm_wlock(&osd_object_lock[type]);
+ /*
+ * Free all OSD for the given slot.
+ */
+ mtx_lock(&osd_list_lock[type]);
+ LIST_FOREACH_SAFE(osd, &osd_list[type], osd_next, tosd)
+ do_osd_del(type, osd, slot, 1);
+ mtx_unlock(&osd_list_lock[type]);
+ /*
+ * Set destructor to NULL to free the slot.
+ */
+ osd_destructors[type][slot - 1] = NULL;
+ if (slot == osd_nslots[type]) {
+ osd_nslots[type]--;
+ osd_destructors[type] = realloc(osd_destructors[type],
+ sizeof(osd_destructor_t) * osd_nslots[type], M_OSD,
+ M_NOWAIT | M_ZERO);
+ if (osd_nmethods[type] != 0)
+ osd_methods[type] = realloc(osd_methods[type],
+ sizeof(osd_method_t) * osd_nslots[type] *
+ osd_nmethods[type], M_OSD, M_NOWAIT | M_ZERO);
+ /*
+ * We always reallocate to a smaller size, so we assume it will
+ * always succeed.
+ */
+ KASSERT(osd_destructors[type] != NULL &&
+ (osd_nmethods[type] == 0 || osd_methods[type] != NULL),
+ ("realloc() failed"));
+ OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).",
+ type, slot);
+ } else {
+ OSD_DEBUG("Slot deregistration (type=%u, slot=%u).",
+ type, slot);
+ }
+ rm_wunlock(&osd_object_lock[type]);
+ sx_xunlock(&osd_module_lock[type]);
+}
+
+int
+osd_set(u_int type, struct osd *osd, u_int slot, void *value)
+{
+ struct rm_priotracker tracker;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ if (slot > osd->osd_nslots) {
+ if (value == NULL) {
+ OSD_DEBUG(
+ "Not allocating null slot (type=%u, slot=%u).",
+ type, slot);
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (0);
+ } else if (osd->osd_nslots == 0) {
+ /*
+ * First OSD for this object, so we need to allocate
+ * space and put it onto the list.
+ */
+ osd->osd_slots = malloc(sizeof(void *) * slot, M_OSD,
+ M_NOWAIT | M_ZERO);
+ if (osd->osd_slots == NULL) {
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (ENOMEM);
+ }
+ osd->osd_nslots = slot;
+ mtx_lock(&osd_list_lock[type]);
+ LIST_INSERT_HEAD(&osd_list[type], osd, osd_next);
+ mtx_unlock(&osd_list_lock[type]);
+ OSD_DEBUG("Setting first slot (type=%u).", type);
+ } else {
+ void *newptr;
+
+ /*
+ * Too few slots are allocated here, so the array
+ * needs to be extended.
+ */
+ newptr = realloc(osd->osd_slots, sizeof(void *) * slot,
+ M_OSD, M_NOWAIT | M_ZERO);
+ if (newptr == NULL) {
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (ENOMEM);
+ }
+ osd->osd_slots = newptr;
+ osd->osd_nslots = slot;
+ OSD_DEBUG("Growing slots array (type=%u).", type);
+ }
+ }
+ OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type,
+ slot, value);
+ osd->osd_slots[slot - 1] = value;
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (0);
+}
+
+void *
+osd_get(u_int type, struct osd *osd, u_int slot)
+{
+ struct rm_priotracker tracker;
+ void *value;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ if (slot > osd->osd_nslots) {
+ value = NULL;
+ OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot);
+ } else {
+ value = osd->osd_slots[slot - 1];
+ OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).",
+ type, slot, value);
+ }
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (value);
+}
+
+void
+osd_del(u_int type, struct osd *osd, u_int slot)
+{
+ struct rm_priotracker tracker;
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ do_osd_del(type, osd, slot, 0);
+ rm_runlock(&osd_object_lock[type], &tracker);
+}
+
+static void
+do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked)
+{
+ int i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot);
+
+ if (slot > osd->osd_nslots) {
+ OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot);
+ return;
+ }
+ if (osd->osd_slots[slot - 1] != NULL) {
+ osd_destructors[type][slot - 1](osd->osd_slots[slot - 1]);
+ osd->osd_slots[slot - 1] = NULL;
+ }
+ for (i = osd->osd_nslots - 1; i >= 0; i--) {
+ if (osd->osd_slots[i] != NULL) {
+ OSD_DEBUG("Slot still has a value (type=%u, slot=%u).",
+ type, i + 1);
+ break;
+ }
+ }
+ if (i == -1) {
+ /* No values left for this object. */
+ OSD_DEBUG("No more slots left (type=%u).", type);
+ if (!list_locked)
+ mtx_lock(&osd_list_lock[type]);
+ LIST_REMOVE(osd, osd_next);
+ if (!list_locked)
+ mtx_unlock(&osd_list_lock[type]);
+ free(osd->osd_slots, M_OSD);
+ osd->osd_slots = NULL;
+ osd->osd_nslots = 0;
+ } else if (slot == osd->osd_nslots) {
+ /* This was the last slot. */
+ osd->osd_slots = realloc(osd->osd_slots,
+ sizeof(void *) * (i + 1), M_OSD, M_NOWAIT | M_ZERO);
+ /*
+ * We always reallocate to a smaller size, so we assume it will
+ * always succeed.
+ */
+ KASSERT(osd->osd_slots != NULL, ("realloc() failed"));
+ osd->osd_nslots = i + 1;
+ OSD_DEBUG("Reducing slots array to %u (type=%u).",
+ osd->osd_nslots, type);
+ }
+}
+
+int
+osd_call(u_int type, u_int method, void *obj, void *data)
+{
+ osd_method_t methodfun;
+ int error, i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(method < osd_nmethods[type], ("Invalid method."));
+
+ /*
+ * Call this method for every slot that defines it, stopping if an
+ * error is encountered.
+ */
+ error = 0;
+ sx_slock(&osd_module_lock[type]);
+ for (i = 0; i < osd_nslots[type]; i++) {
+ methodfun =
+ osd_methods[type][i * osd_nmethods[type] + method];
+ if (methodfun != NULL && (error = methodfun(obj, data)) != 0)
+ break;
+ }
+ sx_sunlock(&osd_module_lock[type]);
+ return (error);
+}
+
+void
+osd_exit(u_int type, struct osd *osd)
+{
+ struct rm_priotracker tracker;
+ u_int i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+
+ if (osd->osd_nslots == 0) {
+ KASSERT(osd->osd_slots == NULL, ("Non-null osd_slots."));
+ /* No OSD attached, just leave. */
+ return;
+ }
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ for (i = 1; i <= osd->osd_nslots; i++) {
+ if (osd_destructors[type][i - 1] != NULL)
+ do_osd_del(type, osd, i, 0);
+ else
+ OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i);
+ }
+ rm_runlock(&osd_object_lock[type], &tracker);
+ OSD_DEBUG("Object exit (type=%u).", type);
+}
+
+static void
+osd_init(void *arg __unused)
+{
+ u_int i;
+
+ for (i = OSD_FIRST; i <= OSD_LAST; i++) {
+ osd_nslots[i] = 0;
+ LIST_INIT(&osd_list[i]);
+ sx_init(&osd_module_lock[i], "osd_module");
+ rm_init(&osd_object_lock[i], "osd_object");
+ mtx_init(&osd_list_lock[i], "osd_list", NULL, MTX_DEF);
+ osd_destructors[i] = NULL;
+ osd_methods[i] = NULL;
+ }
+}
+SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL);
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
new file mode 100644
index 0000000..b37b9f3
--- /dev/null
+++ b/sys/kern/kern_physio.c
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+int
+physio(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct buf *bp;
+ struct cdevsw *csw;
+ caddr_t sa;
+ u_int iolen;
+ int error, i, mapped;
+
+ /* Keep the process UPAGES from being swapped. XXX: why ? */
+ PHOLD(curproc);
+
+ bp = getpbuf(NULL);
+ sa = bp->b_data;
+ error = 0;
+
+ /* XXX: sanity check */
+ if (dev->si_iosize_max < PAGE_SIZE) {
+ printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n",
+ devtoname(dev), dev->si_iosize_max);
+ dev->si_iosize_max = DFLTPHYS;
+ }
+
+ /*
+ * If the driver does not want I/O to be split, we need to reject
+ * any request that will not fit into one buffer.
+ */
+ if (dev->si_flags & SI_NOSPLIT &&
+ (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS ||
+ uio->uio_iovcnt > 1)) {
+ /*
+ * Tell the user why his I/O was rejected.
+ */
+ if (uio->uio_resid > dev->si_iosize_max)
+ uprintf("%s: request size=%zd > si_iosize_max=%d; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_resid, dev->si_iosize_max);
+ if (uio->uio_resid > MAXPHYS)
+ uprintf("%s: request size=%zd > MAXPHYS=%d; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_resid, MAXPHYS);
+ if (uio->uio_iovcnt > 1)
+ uprintf("%s: request vectors=%d > 1; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_iovcnt);
+
+ error = EFBIG;
+ goto doerror;
+ }
+
+ for (i = 0; i < uio->uio_iovcnt; i++) {
+ while (uio->uio_iov[i].iov_len) {
+ bp->b_flags = 0;
+ if (uio->uio_rw == UIO_READ) {
+ bp->b_iocmd = BIO_READ;
+ curthread->td_ru.ru_inblock++;
+ } else {
+ bp->b_iocmd = BIO_WRITE;
+ curthread->td_ru.ru_oublock++;
+ }
+ bp->b_iodone = bdone;
+ bp->b_data = uio->uio_iov[i].iov_base;
+ bp->b_bcount = uio->uio_iov[i].iov_len;
+ bp->b_offset = uio->uio_offset;
+ bp->b_iooffset = uio->uio_offset;
+ bp->b_saveaddr = sa;
+
+ /* Don't exceed the driver's iosize limit. */
+ if (bp->b_bcount > dev->si_iosize_max)
+ bp->b_bcount = dev->si_iosize_max;
+
+ /*
+ * Make sure the pbuf can map the request
+ * XXX: The pbuf has kvasize = MAXPHYS so a request
+ * XXX: larger than MAXPHYS - PAGE_SIZE must be
+ * XXX: page aligned or it will be fragmented.
+ */
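+ /*
+ * Example (illustrative, assuming MAXPHYS = 128 KiB and
+ * PAGE_SIZE = 4 KiB): a large transfer whose buffer starts
+ * 512 bytes into a page is clamped to 128 KiB - 4 KiB =
+ * 124 KiB per pass.
+ */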
+ iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK;
+ if ((bp->b_bcount + iolen) > bp->b_kvasize) {
+ /*
+ * This device does not want I/O to be split.
+ */
+ if (dev->si_flags & SI_NOSPLIT) {
+ uprintf("%s: request ptr %p is not "
+ "on a page boundary; cannot split "
+ "request\n", devtoname(dev),
+ bp->b_data);
+ error = EFBIG;
+ goto doerror;
+ }
+ bp->b_bcount = bp->b_kvasize;
+ if (iolen != 0)
+ bp->b_bcount -= PAGE_SIZE;
+ }
+ bp->b_bufsize = bp->b_bcount;
+
+ bp->b_blkno = btodb(bp->b_offset);
+
+ csw = dev->si_devsw;
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ if (dev->si_flags & SI_UNMAPPED)
+ mapped = 0;
+ else
+ mapped = 1;
+ if (vmapbuf(bp, mapped) < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+ }
+
+ dev_strategy_csw(dev, csw, bp);
+ if (uio->uio_rw == UIO_READ)
+ bwait(bp, PRIBIO, "physrd");
+ else
+ bwait(bp, PRIBIO, "physwr");
+
+ if (uio->uio_segflg == UIO_USERSPACE)
+ vunmapbuf(bp);
+ iolen = bp->b_bcount - bp->b_resid;
+ if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR))
+ goto doerror; /* EOF */
+ uio->uio_iov[i].iov_len -= iolen;
+ uio->uio_iov[i].iov_base =
+ (char *)uio->uio_iov[i].iov_base + iolen;
+ uio->uio_resid -= iolen;
+ uio->uio_offset += iolen;
+ if (bp->b_ioflags & BIO_ERROR) {
+ error = bp->b_error;
+ goto doerror;
+ }
+ }
+ }
+doerror:
+ relpbuf(bp, NULL);
+ PRELE(curproc);
+ return (error);
+}
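+
+/*
+ * Illustrative only (not part of this file): a character device driver
+ * typically routes raw reads and writes through physio() from its
+ * cdevsw methods, e.g. with a hypothetical mydev:
+ *
+ *	static int
+ *	mydev_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
+ *	{
+ *		return (physio(dev, uio, ioflag));
+ *	}
+ */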
diff --git a/sys/kern/kern_pmc.c b/sys/kern/kern_pmc.c
new file mode 100644
index 0000000..2b50be0
--- /dev/null
+++ b/sys/kern/kern_pmc.c
@@ -0,0 +1,345 @@
+/*-
+ * Copyright (c) 2003-2008 Joseph Koshy
+ * Copyright (c) 2007 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/types.h>
+#include <sys/ctype.h>
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pmc.h>
+#include <sys/pmckern.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#ifdef HWPMC_HOOKS
+FEATURE(hwpmc_hooks, "Kernel support for HW PMC");
+#define PMC_KERNEL_VERSION PMC_VERSION
+#else
+#define PMC_KERNEL_VERSION 0
+#endif
+
+MALLOC_DECLARE(M_PMCHOOKS);
+MALLOC_DEFINE(M_PMCHOOKS, "pmchooks", "Memory space for PMC hooks");
+
+const int pmc_kernel_version = PMC_KERNEL_VERSION;
+
+/* Hook variable. */
+int (*pmc_hook)(struct thread *td, int function, void *arg) = NULL;
+
+/* Interrupt handler */
+int (*pmc_intr)(int cpu, struct trapframe *tf) = NULL;
+
+/* Bitmask of CPUs requiring servicing at hardclock time */
+volatile cpuset_t pmc_cpumask;
+
+/*
+ * A global count of SS mode PMCs. When non-zero, this means that
+ * we have processes that are sampling the system as a whole.
+ */
+volatile int pmc_ss_count;
+
+/*
+ * Since PMC(4) may not be loaded in the current kernel, the
+ * convention followed is that a non-NULL value of 'pmc_hook' implies
+ * the presence of this kernel module.
+ *
+ * This requires us to protect 'pmc_hook' with a
+ * shared (sx) lock -- thus making the process of calling into PMC(4)
+ * somewhat more expensive than a simple 'if' check and indirect call.
+ */
+struct sx pmc_sx;
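+
+/*
+ * Illustrative only: a call site would typically enter the hook along
+ * these lines, where "function" and "arg" stand for the caller's event
+ * and payload:
+ *
+ *	sx_slock(&pmc_sx);
+ *	if (pmc_hook != NULL)
+ *		(*pmc_hook)(curthread, function, arg);
+ *	sx_sunlock(&pmc_sx);
+ */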
+
+/*
+ * PMC Soft per cpu trapframe.
+ */
+struct trapframe pmc_tf[MAXCPU];
+
+/*
+ * Soft PMCs use a global table to store registered events.
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
+
+static int pmc_softevents = 16;
+TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "softevents", &pmc_softevents);
+SYSCTL_INT(_kern_hwpmc, OID_AUTO, softevents, CTLFLAG_TUN|CTLFLAG_RD,
+ &pmc_softevents, 0, "maximum number of soft events");
+
+struct mtx pmc_softs_mtx;
+int pmc_softs_count;
+struct pmc_soft **pmc_softs;
+
+MTX_SYSINIT(pmc_soft_mtx, &pmc_softs_mtx, "pmc-softs", MTX_SPIN);
+
+static void
+pmc_init_sx(void)
+{
+ sx_init_flags(&pmc_sx, "pmc-sx", SX_NOWITNESS);
+}
+
+SYSINIT(pmcsx, SI_SUB_LOCK, SI_ORDER_MIDDLE, pmc_init_sx, NULL);
+
+/*
+ * Helper functions.
+ */
+
+/*
+ * A note on the CPU numbering scheme used by the hwpmc(4) driver.
+ *
+ * CPUs are denoted using numbers in the range 0..[pmc_cpu_max()-1].
+ * CPUs could be numbered "sparsely" in this range; the predicate
+ * `pmc_cpu_is_present()' is used to test whether a given CPU is
+ * physically present.
+ *
+ * Further, a CPU that is physically present may be administratively
+ * disabled or otherwise unavailable for use by hwpmc(4). The
+ * `pmc_cpu_is_active()' predicate tests for CPU usability. An
+ * "active" CPU participates in thread scheduling and can field
+ * interrupts raised by PMC hardware.
+ *
+ * On systems with hyperthreaded CPUs, multiple logical CPUs may share
+ * PMC hardware resources. For such processors one logical CPU is
+ * denoted as the primary owner of the in-CPU PMC resources. The
+ * pmc_cpu_is_primary() predicate is used to distinguish this primary
+ * CPU from the others.
+ */
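+
+/*
+ * Sketch of how a consumer would walk CPUs with these predicates
+ * (illustrative only):
+ *
+ *	for (cpu = 0; cpu < pmc_cpu_max(); cpu++) {
+ *		if (!pmc_cpu_is_active(cpu))
+ *			continue;
+ *		... per-CPU work ...
+ *	}
+ */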
+
+int
+pmc_cpu_is_active(int cpu)
+{
+#ifdef SMP
+ return (pmc_cpu_is_present(cpu) &&
+ !CPU_ISSET(cpu, &hlt_cpus_mask));
+#else
+ return (1);
+#endif
+}
+
+/* Deprecated. */
+int
+pmc_cpu_is_disabled(int cpu)
+{
+ return (!pmc_cpu_is_active(cpu));
+}
+
+int
+pmc_cpu_is_present(int cpu)
+{
+#ifdef SMP
+ return (!CPU_ABSENT(cpu));
+#else
+ return (1);
+#endif
+}
+
+int
+pmc_cpu_is_primary(int cpu)
+{
+#ifdef SMP
+ return (!CPU_ISSET(cpu, &logical_cpus_mask));
+#else
+ return (1);
+#endif
+}
+
+
+/*
+ * Return the maximum CPU number supported by the system. The return
+ * value is used for scaling internal data structures and for runtime
+ * checks.
+ */
+unsigned int
+pmc_cpu_max(void)
+{
+#ifdef SMP
+ return (mp_maxid+1);
+#else
+ return (1);
+#endif
+}
+
+#ifdef INVARIANTS
+
+/*
+ * Return the count of CPUs in the `active' state in the system.
+ */
+int
+pmc_cpu_max_active(void)
+{
+#ifdef SMP
+ /*
+ * When support for CPU hot-plugging is added to the kernel,
+ * this function would change to return the current number
+ * of "active" CPUs.
+ */
+ return (mp_ncpus);
+#else
+ return (1);
+#endif
+}
+
+#endif
+
+/*
+ * Cleanup event name:
+ * - remove duplicate '_'
+ * - all uppercase
+ */
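+/*
+ * For example (derived from the code below), "__soft__page_fault_"
+ * becomes "SOFT_PAGE_FAULT".
+ */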
+static void
+pmc_soft_namecleanup(char *name)
+{
+ char *p, *q;
+
+ p = q = name;
+
+ for ( ; *p == '_' ; p++)
+ ;
+ for ( ; *p ; p++) {
+ if (*p == '_' && (*(p + 1) == '_' || *(p + 1) == '\0'))
+ continue;
+ else
+ *q++ = toupper(*p);
+ }
+ *q = '\0';
+}
+
+void
+pmc_soft_ev_register(struct pmc_soft *ps)
+{
+ static int warned = 0;
+ int n;
+
+ ps->ps_running = 0;
+ ps->ps_ev.pm_ev_code = 0; /* invalid */
+ pmc_soft_namecleanup(ps->ps_ev.pm_ev_name);
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ if (pmc_softs_count >= pmc_softevents) {
+ /*
+ * XXX Reusing events can lead to a race condition where a
+ * newly allocated event is treated as an old one.
+ */
+ for (n = 0; n < pmc_softevents; n++)
+ if (pmc_softs[n] == NULL)
+ break;
+ if (n == pmc_softevents) {
+ mtx_unlock_spin(&pmc_softs_mtx);
+ if (!warned) {
+ printf("hwpmc: too many soft events, "
+ "increase kern.hwpmc.softevents tunable\n");
+ warned = 1;
+ }
+ return;
+ }
+
+ ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + n;
+ pmc_softs[n] = ps;
+ } else {
+ ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + pmc_softs_count;
+ pmc_softs[pmc_softs_count++] = ps;
+ }
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+void
+pmc_soft_ev_deregister(struct pmc_soft *ps)
+{
+
+ KASSERT(ps != NULL, ("pmc_soft_deregister: called with NULL"));
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ if (ps->ps_ev.pm_ev_code != 0 &&
+ (ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST) < pmc_softevents) {
+ KASSERT(ps->ps_ev.pm_ev_code >= PMC_EV_SOFT_FIRST &&
+ ps->ps_ev.pm_ev_code <= PMC_EV_SOFT_LAST,
+ ("pmc_soft_deregister: invalid event value"));
+ pmc_softs[ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST] = NULL;
+ }
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+struct pmc_soft *
+pmc_soft_ev_acquire(enum pmc_event ev)
+{
+ struct pmc_soft *ps;
+
+ if (ev == 0 || (ev - PMC_EV_SOFT_FIRST) >= pmc_softevents)
+ return (NULL);
+
+ KASSERT(ev >= PMC_EV_SOFT_FIRST &&
+ ev <= PMC_EV_SOFT_LAST,
+ ("event out of range"));
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ ps = pmc_softs[ev - PMC_EV_SOFT_FIRST];
+ if (ps == NULL)
+ mtx_unlock_spin(&pmc_softs_mtx);
+
+ return (ps);
+}
+
+void
+pmc_soft_ev_release(struct pmc_soft *ps)
+{
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+/*
+ * Initialise hwpmc.
+ */
+static void
+init_hwpmc(void *dummy __unused)
+{
+ if (pmc_softevents <= 0 ||
+ pmc_softevents > PMC_EV_DYN_COUNT) {
+ (void) printf("hwpmc: tunable \"softevents\"=%d out of "
+ "range.\n", pmc_softevents);
+ pmc_softevents = PMC_EV_DYN_COUNT;
+ }
+ pmc_softs = malloc(pmc_softevents * sizeof(struct pmc_soft *), M_PMCHOOKS, M_NOWAIT|M_ZERO);
+ KASSERT(pmc_softs != NULL, ("cannot allocate soft events table"));
+}
+
+SYSINIT(hwpmc, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_hwpmc, NULL);
+
diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c
new file mode 100644
index 0000000..349f338
--- /dev/null
+++ b/sys/kern/kern_poll.c
@@ -0,0 +1,567 @@
+/*-
+ * Copyright (c) 2001-2002 Luigi Rizzo
+ *
+ * Supported by: the Xorp Project (www.xorp.org)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_device_polling.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+#include <sys/eventhandler.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h> /* needed by net/if.h */
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+
+#include <net/if.h> /* for IFF_* flags */
+#include <net/netisr.h> /* for NETISR_POLL */
+#include <net/vnet.h>
+
+void hardclock_device_poll(void); /* hook from hardclock */
+
+static struct mtx poll_mtx;
+
+/*
+ * Polling support for [network] device drivers.
+ *
+ * Drivers which support this feature can register with the
+ * polling code.
+ *
+ * If registration is successful, the driver must disable interrupts,
+ * and further I/O is performed through the handler, which is invoked
+ * (at least once per clock tick) with 3 arguments: the "arg" passed at
+ * register time (a struct ifnet pointer), a command, and a "count" limit.
+ *
+ * The command can be one of the following:
+ * POLL_ONLY: quick move of "count" packets from input/output queues.
+ * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
+ * other more expensive operations. This command is issued periodically
+ * but less frequently than POLL_ONLY.
+ *
+ * The count limit specifies how much work the handler can do during the
+ * call -- typically this is the number of packets to be received, or
+ * transmitted, etc. (drivers are free to interpret this number, as long
+ * as the max time spent in the function grows roughly linearly with the
+ * count).
+ *
+ * Polling is enabled and disabled by setting the IFCAP_POLLING flag
+ * on the interface. The driver's ioctl handler should register the
+ * interface with polling and disable interrupts if registration was
+ * successful.
+ *
+ * A second variable controls the sharing of CPU between polling/kernel
+ * network processing, and other activities (typically userlevel tasks):
+ * kern.polling.user_frac (between 0 and 100, default 50) sets the share
+ * of CPU allocated to user tasks. CPU is allocated proportionally to the
+ * shares, by dynamically adjusting the "count" (poll_burst).
+ *
+ * Other parameters should be left at their default values.
+ * The following constraints hold:
+ *
+ * 1 <= poll_each_burst <= poll_burst <= poll_burst_max
+ * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
+ */
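+
+/*
+ * Minimal driver-side sketch (illustrative only; mydrv_poll,
+ * mydrv_intr_disable and sc are hypothetical):
+ *
+ *	case SIOCSIFCAP:
+ *		if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0 &&
+ *		    (ifp->if_capenable & IFCAP_POLLING) == 0) {
+ *			error = ether_poll_register(mydrv_poll, ifp);
+ *			if (error == 0) {
+ *				mydrv_intr_disable(sc);
+ *				ifp->if_capenable |= IFCAP_POLLING;
+ *			}
+ *		}
+ *		break;
+ */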
+
+#define MIN_POLL_BURST_MAX 10
+#define MAX_POLL_BURST_MAX 20000
+
+static uint32_t poll_burst = 5;
+static uint32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */
+static uint32_t poll_each_burst = 5;
+
+static SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
+ "Device polling parameters");
+
+SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD,
+ &poll_burst, 0, "Current polling burst size");
+
+static int netisr_poll_scheduled;
+static int netisr_pollmore_scheduled;
+static int poll_shutting_down;
+
+static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = poll_burst_max;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ poll_burst_max = val;
+ if (poll_burst > poll_burst_max)
+ poll_burst = poll_burst_max;
+ if (poll_each_burst > poll_burst_max)
+ poll_each_burst = MIN_POLL_BURST_MAX;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, burst_max, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), poll_burst_max_sysctl, "I", "Max Polling burst size");
+
+static int poll_each_burst_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = poll_each_burst;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < 1)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ if (val > poll_burst_max) {
+ mtx_unlock(&poll_mtx);
+ return (EINVAL);
+ }
+ poll_each_burst = val;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, each_burst, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), poll_each_burst_sysctl, "I",
+ "Max size of each burst");
+
+static uint32_t poll_in_idle_loop = 0; /* do we poll in idle loop? */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW,
+ &poll_in_idle_loop, 0, "Enable device polling in idle loop");
+
+static uint32_t user_frac = 50;
+static int user_frac_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = user_frac;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val > 99)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ user_frac = val;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, user_frac, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), user_frac_sysctl, "I",
+ "Desired user fraction of cpu time");
+
+static uint32_t reg_frac_count = 0;
+static uint32_t reg_frac = 20;
+static int reg_frac_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = reg_frac;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < 1 || val > hz)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ reg_frac = val;
+ if (reg_frac_count >= reg_frac)
+ reg_frac_count = 0;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, reg_frac, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), reg_frac_sysctl, "I",
+ "Every this many cycles check registers");
+
+static uint32_t short_ticks;
+SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RD,
+ &short_ticks, 0, "Hardclock ticks shorter than they should be");
+
+static uint32_t lost_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RD,
+ &lost_polls, 0, "How many times we would have lost a poll tick");
+
+static uint32_t pending_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RD,
+ &pending_polls, 0, "Do we need to poll again");
+
+static int residual_burst = 0;
+SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RD,
+ &residual_burst, 0, "# of residual cycles in burst");
+
+static uint32_t poll_handlers; /* next free entry in pr[]. */
+SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD,
+ &poll_handlers, 0, "Number of registered poll handlers");
+
+static uint32_t phase;
+SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RD,
+ &phase, 0, "Polling phase");
+
+static uint32_t suspect;
+SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RD,
+ &suspect, 0, "suspect event");
+
+static uint32_t stalled;
+SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RD,
+ &stalled, 0, "potential stalls");
+
+static uint32_t idlepoll_sleeping; /* idlepoll is sleeping */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD,
+ &idlepoll_sleeping, 0, "idlepoll is sleeping");
+
+
+#define POLL_LIST_LEN 128
+struct pollrec {
+ poll_handler_t *handler;
+ struct ifnet *ifp;
+};
+
+static struct pollrec pr[POLL_LIST_LEN];
+
+static void
+poll_shutdown(void *arg, int howto)
+{
+
+ poll_shutting_down = 1;
+}
+
+static void
+init_device_poll(void)
+{
+
+ mtx_init(&poll_mtx, "polling", NULL, MTX_DEF);
+ EVENTHANDLER_REGISTER(shutdown_post_sync, poll_shutdown, NULL,
+ SHUTDOWN_PRI_LAST);
+}
+SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL);
+
+
+/*
+ * Hook from hardclock. Tries to schedule a netisr, but keeps track
+ * of lost ticks due to the previous handler taking too long.
+ * Normally, this should not happen, because the polling handler should
+ * run for a short time. However, in some cases (e.g. when there are
+ * changes in link status etc.) the drivers take a very long time
+ * (even in the order of milliseconds) to reset and reconfigure the
+ * device, causing apparent lost polls.
+ *
+ * The first part of the code is just for debugging purposes, and tries
+ * to count how often hardclock ticks are shorter than they should be,
+ * meaning either stray interrupts or delayed events.
+ */
+void
+hardclock_device_poll(void)
+{
+ static struct timeval prev_t, t;
+ int delta;
+
+ if (poll_handlers == 0 || poll_shutting_down)
+ return;
+
+ microuptime(&t);
+ delta = (t.tv_usec - prev_t.tv_usec) +
+ (t.tv_sec - prev_t.tv_sec)*1000000;
+ if (delta * hz < 500000)
+ short_ticks++;
+ else
+ prev_t = t;
+
+ if (pending_polls > 100) {
+ /*
+ * Too many, assume polling has stalled (not always true;
+ * see the comment above).
+ */
+ stalled++;
+ pending_polls = 0;
+ phase = 0;
+ }
+
+ if (phase <= 2) {
+ if (phase != 0)
+ suspect++;
+ phase = 1;
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ phase = 2;
+ }
+ if (pending_polls++ > 0)
+ lost_polls++;
+}
+
+/*
+ * ether_poll is called from the idle loop.
+ */
+static void
+ether_poll(int count)
+{
+ int i;
+
+ mtx_lock(&poll_mtx);
+
+ if (count > poll_each_burst)
+ count = poll_each_burst;
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ pr[i].handler(pr[i].ifp, POLL_ONLY, count);
+
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * netisr_pollmore is called after other netisr's, possibly scheduling
+ * another NETISR_POLL call, or adapting the burst size for the next cycle.
+ *
+ * It is very bad to fetch large bursts of packets from a single card at once,
+ * because the burst could take a long time to be completely processed, or
+ * could saturate the intermediate queue (ipintrq or similar) leading to
+ * losses or unfairness. To reduce the problem, and also to account better for
+ * time spent in network-related processing, we split the burst in smaller
+ * chunks of fixed size, giving control to the other netisr's between chunks.
+ * This helps in improving the fairness, reducing livelock (because we
+ * emulate more closely the "process to completion" that we have with
+ * fastforwarding) and accounting for the work performed in low level
+ * handling and forwarding.
+ */
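+
+/*
+ * For example (illustrative): if poll_burst has grown to 150 and
+ * poll_each_burst is 5, netisr_poll() runs 30 times during the tick,
+ * handing each registered handler 5 packets per pass and letting the
+ * other netisrs run between passes.
+ */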
+
+static struct timeval poll_start_t;
+
+void
+netisr_pollmore()
+{
+ struct timeval t;
+ int kern_load;
+
+ mtx_lock(&poll_mtx);
+ if (!netisr_pollmore_scheduled) {
+ mtx_unlock(&poll_mtx);
+ return;
+ }
+ netisr_pollmore_scheduled = 0;
+ phase = 5;
+ if (residual_burst > 0) {
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ mtx_unlock(&poll_mtx);
+ /* will run immediately on return, followed by netisrs */
+ return;
+ }
+ /* here we can account time spent in netisr's in this tick */
+ microuptime(&t);
+ kern_load = (t.tv_usec - poll_start_t.tv_usec) +
+ (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */
+ kern_load = (kern_load * hz) / 10000; /* 0..100 */
+ if (kern_load > (100 - user_frac)) { /* try decrease ticks */
+ if (poll_burst > 1)
+ poll_burst--;
+ } else {
+ if (poll_burst < poll_burst_max)
+ poll_burst++;
+ }
+
+ pending_polls--;
+ if (pending_polls == 0) /* we are done */
+ phase = 0;
+ else {
+ /*
+ * Last cycle was long and caused us to miss one or more
+ * hardclock ticks. Restart processing, but slightly reduce the
+ * burst size to prevent this from happening again.
+ */
+ poll_burst -= (poll_burst / 8);
+ if (poll_burst < 1)
+ poll_burst = 1;
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ phase = 6;
+ }
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * netisr_poll is typically scheduled once per tick.
+ */
+void
+netisr_poll(void)
+{
+ int i, cycles;
+ enum poll_cmd arg = POLL_ONLY;
+
+ mtx_lock(&poll_mtx);
+ if (!netisr_poll_scheduled) {
+ mtx_unlock(&poll_mtx);
+ return;
+ }
+ netisr_poll_scheduled = 0;
+ phase = 3;
+ if (residual_burst == 0) { /* first call in this tick */
+ microuptime(&poll_start_t);
+ if (++reg_frac_count == reg_frac) {
+ arg = POLL_AND_CHECK_STATUS;
+ reg_frac_count = 0;
+ }
+
+ residual_burst = poll_burst;
+ }
+ cycles = (residual_burst < poll_each_burst) ?
+ residual_burst : poll_each_burst;
+ residual_burst -= cycles;
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ pr[i].handler(pr[i].ifp, arg, cycles);
+
+ phase = 4;
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * Try to register routine for polling. Returns 0 if successful
+ * (and polling should be enabled), error code otherwise.
+ * A device is not supposed to register itself multiple times.
+ *
+ * This is called from within the *_ioctl() functions.
+ */
+int
+ether_poll_register(poll_handler_t *h, struct ifnet *ifp)
+{
+ int i;
+
+ KASSERT(h != NULL, ("%s: handler is NULL", __func__));
+ KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
+
+ mtx_lock(&poll_mtx);
+ if (poll_handlers >= POLL_LIST_LEN) {
+ /*
+ * List full, cannot register more entries.
+ * This should never happen; if it does, it is probably a
+ * broken driver trying to register multiple times. Checking
+ * this at runtime is expensive, and won't solve the problem
+ * anyway, so just report a few times and then give up.
+ */
+ static int verbose = 10;
+ if (verbose > 0) {
+ log(LOG_ERR, "poll handlers list full, "
+ "maybe a broken driver?\n");
+ verbose--;
+ }
+ mtx_unlock(&poll_mtx);
+ return (ENOMEM); /* no polling for you */
+ }
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].ifp == ifp && pr[i].handler != NULL) {
+ mtx_unlock(&poll_mtx);
+ log(LOG_DEBUG, "ether_poll_register: %s: handler"
+ " already registered\n", ifp->if_xname);
+ return (EEXIST);
+ }
+
+ pr[poll_handlers].handler = h;
+ pr[poll_handlers].ifp = ifp;
+ poll_handlers++;
+ mtx_unlock(&poll_mtx);
+ if (idlepoll_sleeping)
+ wakeup(&idlepoll_sleeping);
+ return (0);
+}
+
+/*
+ * Remove interface from the polling list. Called from *_ioctl(), too.
+ */
+int
+ether_poll_deregister(struct ifnet *ifp)
+{
+ int i;
+
+ KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
+
+ mtx_lock(&poll_mtx);
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].ifp == ifp) /* found it */
+ break;
+ if (i == poll_handlers) {
+ log(LOG_DEBUG, "ether_poll_deregister: %s: not found!\n",
+ ifp->if_xname);
+ mtx_unlock(&poll_mtx);
+ return (ENOENT);
+ }
+ poll_handlers--;
+ if (i < poll_handlers) { /* Last entry replaces this one. */
+ pr[i].handler = pr[poll_handlers].handler;
+ pr[i].ifp = pr[poll_handlers].ifp;
+ }
+ mtx_unlock(&poll_mtx);
+ return (0);
+}
+
+static void
+poll_idle(void)
+{
+ struct thread *td = curthread;
+ struct rtprio rtp;
+
+ rtp.prio = RTP_PRIO_MAX; /* lowest priority */
+ rtp.type = RTP_PRIO_IDLE;
+ PROC_SLOCK(td->td_proc);
+ rtp_to_pri(&rtp, td);
+ PROC_SUNLOCK(td->td_proc);
+
+ for (;;) {
+ if (poll_in_idle_loop && poll_handlers > 0) {
+ idlepoll_sleeping = 0;
+ ether_poll(poll_each_burst);
+ thread_lock(td);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(td);
+ } else {
+ idlepoll_sleeping = 1;
+ tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3);
+ }
+ }
+}
+
+static struct proc *idlepoll;
+static struct kproc_desc idlepoll_kp = {
+ "idlepoll",
+ poll_idle,
+ &idlepoll
+};
+SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start,
+ &idlepoll_kp);
diff --git a/sys/kern/kern_priv.c b/sys/kern/kern_priv.c
new file mode 100644
index 0000000..4d266ab
--- /dev/null
+++ b/sys/kern/kern_priv.c
@@ -0,0 +1,185 @@
+/*-
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
+ * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_kdtrace.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * `suser_enabled' (which can be set by the security.bsd.suser_enabled
+ * sysctl) determines whether the system 'super-user' policy is in effect. If
+ * it is nonzero, an effective uid of 0 connotes special privilege,
+ * overriding many mandatory and discretionary protections. If it is zero,
+ * uid 0 is offered no special privilege in the kernel security policy.
+ * Setting it to zero may seriously impact the functionality of many existing
+ * userland programs, and should not be done without careful consideration of
+ * the consequences.
+ */
+static int suser_enabled = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
+ &suser_enabled, 0, "processes with uid 0 have privilege");
+TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
+
+static int unprivileged_mlock = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, CTLFLAG_RW|CTLFLAG_TUN,
+ &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)");
+TUNABLE_INT("security.bsd.unprivileged_mlock", &unprivileged_mlock);
+
+SDT_PROVIDER_DEFINE(priv);
+SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_ok, priv-ok, "int");
+SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_err, priv-err, "int");
+
+/*
+ * Check a credential for privilege. Lots of good reasons to deny privilege;
+ * only a few to grant it.
+ */
+int
+priv_check_cred(struct ucred *cred, int priv, int flags)
+{
+ int error;
+
+ KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d",
+ priv));
+
+ /*
+ * We first evaluate policies that may deny the granting of
+ * privilege unilaterally.
+ */
+#ifdef MAC
+ error = mac_priv_check(cred, priv);
+ if (error)
+ goto out;
+#endif
+
+ /*
+ * Jail policy will restrict certain privileges that may otherwise
+ * be granted.
+ */
+ error = prison_priv_check(cred, priv);
+ if (error)
+ goto out;
+
+ if (unprivileged_mlock) {
+ /*
+ * Allow unprivileged users to call mlock(2)/munlock(2) and
+ * mlockall(2)/munlockall(2).
+ */
+ switch (priv) {
+ case PRIV_VM_MLOCK:
+ case PRIV_VM_MUNLOCK:
+ error = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * Having determined if privilege is restricted by various policies,
+ * now determine if privilege is granted. At this point, any policy
+ * may grant privilege. For now, we allow short-circuit boolean
+ * evaluation, so may not call all policies. Perhaps we should.
+ *
+ * Superuser policy grants privilege based on the effective (or in
+ * the case of specific privileges, real) uid being 0. We allow the
+ * superuser policy to be globally disabled, although this is
+ * currently of limited utility.
+ */
+ if (suser_enabled) {
+ switch (priv) {
+ case PRIV_MAXFILES:
+ case PRIV_MAXPROC:
+ case PRIV_PROC_LIMIT:
+ if (cred->cr_ruid == 0) {
+ error = 0;
+ goto out;
+ }
+ break;
+ default:
+ if (cred->cr_uid == 0) {
+ error = 0;
+ goto out;
+ }
+ break;
+ }
+ }
+
+ /*
+ * Writes to kernel/physical memory are a typical root-only operation,
+ * but non-root users are expected to be able to read it (provided they
+ * have permission to access /dev/[k]mem).
+ */
+ if (priv == PRIV_KMEM_READ) {
+ error = 0;
+ goto out;
+ }
+
+ /*
+ * Now check with MAC, if enabled, to see if a policy module grants
+ * privilege.
+ */
+#ifdef MAC
+ if (mac_priv_grant(cred, priv) == 0) {
+ error = 0;
+ goto out;
+ }
+#endif
+
+ /*
+ * The default is deny, so if no policies have granted it, reject
+ * with a privilege error here.
+ */
+ error = EPERM;
+out:
+ if (error)
+ SDT_PROBE1(priv, kernel, priv_check, priv_err, priv);
+ else
+ SDT_PROBE1(priv, kernel, priv_check, priv_ok, priv);
+ return (error);
+}
+
+int
+priv_check(struct thread *td, int priv)
+{
+
+ KASSERT(td == curthread, ("priv_check: td != curthread"));
+
+ return (priv_check_cred(td->td_ucred, priv, 0));
+}
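[Editorial sketch, not part of this change: a minimal view of the consumer side of this API. The foo_set_debug_knob() wrapper is hypothetical; priv_check() is the function defined above and PRIV_DRIVER is the generic low-level driver privilege from sys/priv.h.]

	/* Hypothetical consumer: ask for one specific privilege, bail on error. */
	static int
	foo_set_debug_knob(struct thread *td, int value)
	{
		int error;

		/* EPERM unless some policy (superuser, MAC, ...) grants it. */
		error = priv_check(td, PRIV_DRIVER);
		if (error != 0)
			return (error);
		/* ...apply 'value' to the privileged knob here... */
		return (0);
	}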
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
new file mode 100644
index 0000000..3fa7a7f
--- /dev/null
+++ b/sys/kern/kern_proc.c
@@ -0,0 +1,2740 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_kstack_pages.h"
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/elf.h>
+#include <sys/exec.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/ptrace.h>
+#include <sys/refcount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysent.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/stack.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/filedesc.h>
+#include <sys/tty.h>
+#include <sys/signalvar.h>
+#include <sys/sdt.h>
+#include <sys/sx.h>
+#include <sys/user.h>
+#include <sys/jail.h>
+#include <sys/vnode.h>
+#include <sys/eventhandler.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#endif
+
+SDT_PROVIDER_DEFINE(proc);
+SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, entry, "struct proc *", "int",
+ "void *", "int");
+SDT_PROBE_DEFINE4(proc, kernel, ctor, return, return, "struct proc *", "int",
+ "void *", "int");
+SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, entry, "struct proc *", "int",
+ "void *", "struct thread *");
+SDT_PROBE_DEFINE3(proc, kernel, dtor, return, return, "struct proc *", "int",
+ "void *");
+SDT_PROBE_DEFINE3(proc, kernel, init, entry, entry, "struct proc *", "int",
+ "int");
+SDT_PROBE_DEFINE3(proc, kernel, init, return, return, "struct proc *", "int",
+ "int");
+
+MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
+MALLOC_DEFINE(M_SESSION, "session", "session header");
+static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
+MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
+
+static void doenterpgrp(struct proc *, struct pgrp *);
+static void orphanpg(struct pgrp *pg);
+static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
+ int preferthread);
+static void pgadjustjobc(struct pgrp *pgrp, int entering);
+static void pgdelete(struct pgrp *);
+static int proc_ctor(void *mem, int size, void *arg, int flags);
+static void proc_dtor(void *mem, int size, void *arg);
+static int proc_init(void *mem, int size, int flags);
+static void proc_fini(void *mem, int size);
+static void pargs_free(struct pargs *pa);
+static struct proc *zpfind_locked(pid_t pid);
+
+/*
+ * Other process lists
+ */
+struct pidhashhead *pidhashtbl;
+u_long pidhash;
+struct pgrphashhead *pgrphashtbl;
+u_long pgrphash;
+struct proclist allproc;
+struct proclist zombproc;
+struct sx allproc_lock;
+struct sx proctree_lock;
+struct mtx ppeers_lock;
+uma_zone_t proc_zone;
+
+int kstack_pages = KSTACK_PAGES;
+SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
+ "Kernel stack size in pages");
+
+CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
+#ifdef COMPAT_FREEBSD32
+CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
+#endif
+
+/*
+ * Initialize global process hashing structures.
+ */
+void
+procinit()
+{
+
+ sx_init(&allproc_lock, "allproc");
+ sx_init(&proctree_lock, "proctree");
+ mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
+ LIST_INIT(&allproc);
+ LIST_INIT(&zombproc);
+ pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
+ pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
+ proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
+ proc_ctor, proc_dtor, proc_init, proc_fini,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uihashinit();
+}
+
+/*
+ * Prepare a proc for use.
+ */
+static int
+proc_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ SDT_PROBE(proc, kernel, ctor, entry, p, size, arg, flags, 0);
+ EVENTHANDLER_INVOKE(process_ctor, p);
+ SDT_PROBE(proc, kernel, ctor, return, p, size, arg, flags, 0);
+ return (0);
+}
+
+/*
+ * Reclaim a proc after use.
+ */
+static void
+proc_dtor(void *mem, int size, void *arg)
+{
+ struct proc *p;
+ struct thread *td;
+
+ /* INVARIANTS checks go here */
+ p = (struct proc *)mem;
+ td = FIRST_THREAD_IN_PROC(p);
+ SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0);
+ if (td != NULL) {
+#ifdef INVARIANTS
+ KASSERT((p->p_numthreads == 1),
+ ("bad number of threads in exiting process"));
+ KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
+#endif
+ /* Free all OSD associated to this thread. */
+ osd_thread_exit(td);
+ }
+ EVENTHANDLER_INVOKE(process_dtor, p);
+ if (p->p_ksi != NULL)
+ KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
+ SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
+}
+
+/*
+ * Initialize type-stable parts of a proc (when newly created).
+ */
+static int
+proc_init(void *mem, int size, int flags)
+{
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
+ p->p_sched = (struct p_sched *)&p[1];
+ bzero(&p->p_mtx, sizeof(struct mtx));
+ mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+ cv_init(&p->p_pwait, "ppwait");
+ cv_init(&p->p_dbgwait, "dbgwait");
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ EVENTHANDLER_INVOKE(process_init, p);
+ p->p_stats = pstats_alloc();
+ SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
+ return (0);
+}
+
+/*
+ * UMA should ensure that this function is never called.
+ * Freeing a proc structure would violate type stability.
+ */
+static void
+proc_fini(void *mem, int size)
+{
+#ifdef notnow
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ EVENTHANDLER_INVOKE(process_fini, p);
+ pstats_free(p->p_stats);
+ thread_free(FIRST_THREAD_IN_PROC(p));
+ mtx_destroy(&p->p_mtx);
+ if (p->p_ksi != NULL)
+ ksiginfo_free(p->p_ksi);
+#else
+ panic("proc reclaimed");
+#endif
+}
+
+/*
+ * Is p an inferior of the current process?
+ */
+int
+inferior(p)
+ register struct proc *p;
+{
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (; p != curproc; p = p->p_pptr)
+ if (p->p_pid == 0)
+ return (0);
+ return (1);
+}
+
+struct proc *
+pfind_locked(pid_t pid)
+{
+ struct proc *p;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, PIDHASH(pid), p_hash) {
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ p = NULL;
+ }
+ break;
+ }
+ }
+ return (p);
+}
+
+/*
+ * Locate a process by number; return only "live" processes -- i.e., neither
+ * zombies nor newly born but incompletely initialized processes. By not
+ * returning processes in the PRS_NEW state, we allow callers to avoid
+ * testing for that condition to avoid dereferencing p_ucred, et al.
+ */
+struct proc *
+pfind(pid_t pid)
+{
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ p = pfind_locked(pid);
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+static struct proc *
+pfind_tid_locked(pid_t tid)
+{
+ struct proc *p;
+ struct thread *td;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_tid == tid)
+ goto found;
+ }
+ PROC_UNLOCK(p);
+ }
+found:
+ return (p);
+}
+
+/*
+ * Locate a process group by number.
+ * The caller must hold proctree_lock.
+ */
+struct pgrp *
+pgfind(pgid)
+ register pid_t pgid;
+{
+ register struct pgrp *pgrp;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+
+ LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
+ if (pgrp->pg_id == pgid) {
+ PGRP_LOCK(pgrp);
+ return (pgrp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Locate process and do additional manipulations, depending on flags.
+ */
+int
+pget(pid_t pid, int flags, struct proc **pp)
+{
+ struct proc *p;
+ int error;
+
+ sx_slock(&allproc_lock);
+ if (pid <= PID_MAX) {
+ p = pfind_locked(pid);
+ if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
+ p = zpfind_locked(pid);
+ } else if ((flags & PGET_NOTID) == 0) {
+ p = pfind_tid_locked(pid);
+ } else {
+ p = NULL;
+ }
+ sx_sunlock(&allproc_lock);
+ if (p == NULL)
+ return (ESRCH);
+ if ((flags & PGET_CANSEE) != 0) {
+ error = p_cansee(curthread, p);
+ if (error != 0)
+ goto errout;
+ }
+ if ((flags & PGET_CANDEBUG) != 0) {
+ error = p_candebug(curthread, p);
+ if (error != 0)
+ goto errout;
+ }
+ if ((flags & PGET_ISCURRENT) != 0 && curproc != p) {
+ error = EPERM;
+ goto errout;
+ }
+ if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) {
+ error = ESRCH;
+ goto errout;
+ }
+ if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) {
+ /*
+ * XXXRW: Not clear ESRCH is the right error during proc
+ * execve().
+ */
+ error = ESRCH;
+ goto errout;
+ }
+ if ((flags & PGET_HOLD) != 0) {
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ }
+ *pp = p;
+ return (0);
+errout:
+ PROC_UNLOCK(p);
+ return (error);
+}
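[Editorial sketch, not part of this change: a hedged usage example for pget(). The foo_inspect() caller is hypothetical; the PGET_* flags and PRELE() are the real interfaces used in the function above. Callers name the checks they want and receive the process locked, or held and unlocked when PGET_HOLD is passed.]

	/* Hypothetical caller illustrating the PGET_* flags. */
	static int
	foo_inspect(pid_t pid)
	{
		struct proc *p;
		int error;

		error = pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD, &p);
		if (error != 0)
			return (error);
		/* With PGET_HOLD the process is held and unlocked here. */
		/* ...inspect *p... */
		PRELE(p);		/* Drop the hold taken by PGET_HOLD. */
		return (0);
	}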
+
+/*
+ * Create a new process group.
+ * pgid must be equal to the pid of p.
+ * Begin a new session if required.
+ */
+int
+enterpgrp(p, pgid, pgrp, sess)
+ register struct proc *p;
+ pid_t pgid;
+ struct pgrp *pgrp;
+ struct session *sess;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+
+ KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
+ KASSERT(p->p_pid == pgid,
+ ("enterpgrp: new pgrp and pid != pgid"));
+ KASSERT(pgfind(pgid) == NULL,
+ ("enterpgrp: pgrp with pgid exists"));
+ KASSERT(!SESS_LEADER(p),
+ ("enterpgrp: session leader attempted setpgrp"));
+
+ mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+
+ if (sess != NULL) {
+ /*
+ * new session
+ */
+ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
+ PROC_LOCK(p);
+ p->p_flag &= ~P_CONTROLT;
+ PROC_UNLOCK(p);
+ PGRP_LOCK(pgrp);
+ sess->s_leader = p;
+ sess->s_sid = p->p_pid;
+ refcount_init(&sess->s_count, 1);
+ sess->s_ttyvp = NULL;
+ sess->s_ttydp = NULL;
+ sess->s_ttyp = NULL;
+ bcopy(p->p_session->s_login, sess->s_login,
+ sizeof(sess->s_login));
+ pgrp->pg_session = sess;
+ KASSERT(p == curproc,
+ ("enterpgrp: mksession and p != curproc"));
+ } else {
+ pgrp->pg_session = p->p_session;
+ sess_hold(pgrp->pg_session);
+ PGRP_LOCK(pgrp);
+ }
+ pgrp->pg_id = pgid;
+ LIST_INIT(&pgrp->pg_members);
+
+ /*
+ * As we have an exclusive lock of proctree_lock,
+ * this should not deadlock.
+ */
+ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
+ pgrp->pg_jobc = 0;
+ SLIST_INIT(&pgrp->pg_sigiolst);
+ PGRP_UNLOCK(pgrp);
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to an existing process group
+ */
+int
+enterthispgrp(p, pgrp)
+ register struct proc *p;
+ struct pgrp *pgrp;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+ KASSERT(pgrp->pg_session == p->p_session,
+ ("%s: pgrp's session %p, p->p_session %p.\n",
+ __func__,
+ pgrp->pg_session,
+ p->p_session));
+ KASSERT(pgrp != p->p_pgrp,
+ ("%s: p belongs to pgrp.", __func__));
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to a process group
+ */
+static void
+doenterpgrp(p, pgrp)
+ struct proc *p;
+ struct pgrp *pgrp;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+
+ savepgrp = p->p_pgrp;
+
+ /*
+ * Adjust eligibility of affected pgrps to participate in job control.
+ * Increment eligibility counts before decrementing, otherwise we
+ * could reach 0 spuriously during the first call.
+ */
+ fixjobc(p, pgrp, 1);
+ fixjobc(p, p->p_pgrp, 0);
+
+ PGRP_LOCK(pgrp);
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = pgrp;
+ PROC_UNLOCK(p);
+ LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
+ PGRP_UNLOCK(savepgrp);
+ PGRP_UNLOCK(pgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+}
+
+/*
+ * remove process from process group
+ */
+int
+leavepgrp(p)
+ register struct proc *p;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ savepgrp = p->p_pgrp;
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = NULL;
+ PROC_UNLOCK(p);
+ PGRP_UNLOCK(savepgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+ return (0);
+}
+
+/*
+ * delete a process group
+ */
+static void
+pgdelete(pgrp)
+ register struct pgrp *pgrp;
+{
+ struct session *savesess;
+ struct tty *tp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pgid.
+ */
+ funsetownlst(&pgrp->pg_sigiolst);
+
+ PGRP_LOCK(pgrp);
+ tp = pgrp->pg_session->s_ttyp;
+ LIST_REMOVE(pgrp, pg_hash);
+ savesess = pgrp->pg_session;
+ PGRP_UNLOCK(pgrp);
+
+ /* Remove the reference to the pgrp before deallocating it. */
+ if (tp != NULL) {
+ tty_lock(tp);
+ tty_rel_pgrp(tp, pgrp);
+ }
+
+ mtx_destroy(&pgrp->pg_mtx);
+ free(pgrp, M_PGRP);
+ sess_release(savesess);
+}
+
+static void
+pgadjustjobc(pgrp, entering)
+ struct pgrp *pgrp;
+ int entering;
+{
+
+ PGRP_LOCK(pgrp);
+ if (entering)
+ pgrp->pg_jobc++;
+ else {
+ --pgrp->pg_jobc;
+ if (pgrp->pg_jobc == 0)
+ orphanpg(pgrp);
+ }
+ PGRP_UNLOCK(pgrp);
+}
+
+/*
+ * Adjust pgrp jobc counters when specified process changes process group.
+ * We count the number of processes in each process group that "qualify"
+ * the group for terminal job control (those with a parent in a different
+ * process group of the same session). If that count reaches zero, the
+ * process group becomes orphaned. Check both the specified process'
+ * process group and that of its children.
+ * entering == 0 => p is leaving specified group.
+ * entering == 1 => p is entering specified group.
+ */
+void
+fixjobc(p, pgrp, entering)
+ register struct proc *p;
+ register struct pgrp *pgrp;
+ int entering;
+{
+ register struct pgrp *hispgrp;
+ register struct session *mysession;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Check p's parent to see whether p qualifies its own process
+ * group; if so, adjust count for p's process group.
+ */
+ mysession = pgrp->pg_session;
+ if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession)
+ pgadjustjobc(pgrp, entering);
+
+ /*
+ * Check this process' children to see whether they qualify
+ * their process groups; if so, adjust counts for children's
+ * process groups.
+ */
+ LIST_FOREACH(p, &p->p_children, p_sibling) {
+ hispgrp = p->p_pgrp;
+ if (hispgrp == pgrp ||
+ hispgrp->pg_session != mysession)
+ continue;
+ PROC_LOCK(p);
+ if (p->p_state == PRS_ZOMBIE) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ PROC_UNLOCK(p);
+ pgadjustjobc(hispgrp, entering);
+ }
+}
+
+/*
+ * A process group has become orphaned;
+ * if there are any stopped processes in the group,
+ * hang up all processes in that group.
+ */
+static void
+orphanpg(pg)
+ struct pgrp *pg;
+{
+ register struct proc *p;
+
+ PGRP_LOCK_ASSERT(pg, MA_OWNED);
+
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (P_SHOULDSTOP(p)) {
+ PROC_UNLOCK(p);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGHUP);
+ kern_psignal(p, SIGCONT);
+ PROC_UNLOCK(p);
+ }
+ return;
+ }
+ PROC_UNLOCK(p);
+ }
+}
+
+void
+sess_hold(struct session *s)
+{
+
+ refcount_acquire(&s->s_count);
+}
+
+void
+sess_release(struct session *s)
+{
+
+ if (refcount_release(&s->s_count)) {
+ if (s->s_ttyp != NULL) {
+ tty_lock(s->s_ttyp);
+ tty_rel_sess(s->s_ttyp, s);
+ }
+ mtx_destroy(&s->s_mtx);
+ free(s, M_SESSION);
+ }
+}
+
+#ifdef DDB
+
+DB_SHOW_COMMAND(pgrpdump, pgrpdump)
+{
+ register struct pgrp *pgrp;
+ register struct proc *p;
+ register int i;
+
+ for (i = 0; i <= pgrphash; i++) {
+ if (!LIST_EMPTY(&pgrphashtbl[i])) {
+ printf("\tindx %d\n", i);
+ LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
+ printf(
+ "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
+ (void *)pgrp, (long)pgrp->pg_id,
+ (void *)pgrp->pg_session,
+ pgrp->pg_session->s_count,
+ (void *)LIST_FIRST(&pgrp->pg_members));
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ printf("\t\tpid %ld addr %p pgrp %p\n",
+ (long)p->p_pid, (void *)p,
+ (void *)p->p_pgrp);
+ }
+ }
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Calculate the kinfo_proc members which contain process-wide
+ * information.
+ * Must be called with the target process locked.
+ */
+static void
+fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ kp->ki_estcpu = 0;
+ kp->ki_pctcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ kp->ki_pctcpu += sched_pctcpu(td);
+ kp->ki_estcpu += td->td_estcpu;
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Clear kinfo_proc and fill in any information that is common
+ * to all threads in the process.
+ * Must be called with the target process locked.
+ */
+static void
+fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
+{
+ struct thread *td0;
+ struct tty *tp;
+ struct session *sp;
+ struct ucred *cred;
+ struct sigacts *ps;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ bzero(kp, sizeof(*kp));
+
+ kp->ki_structsize = sizeof(*kp);
+ kp->ki_paddr = p;
+ kp->ki_addr =/* p->p_addr; */0; /* XXX */
+ kp->ki_args = p->p_args;
+ kp->ki_textvp = p->p_textvp;
+#ifdef KTRACE
+ kp->ki_tracep = p->p_tracevp;
+ kp->ki_traceflag = p->p_traceflag;
+#endif
+ kp->ki_fd = p->p_fd;
+ kp->ki_vmspace = p->p_vmspace;
+ kp->ki_flag = p->p_flag;
+ cred = p->p_ucred;
+ if (cred) {
+ kp->ki_uid = cred->cr_uid;
+ kp->ki_ruid = cred->cr_ruid;
+ kp->ki_svuid = cred->cr_svuid;
+ kp->ki_cr_flags = 0;
+ if (cred->cr_flags & CRED_FLAG_CAPMODE)
+ kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
+ /* XXX bde doesn't like KI_NGROUPS */
+ if (cred->cr_ngroups > KI_NGROUPS) {
+ kp->ki_ngroups = KI_NGROUPS;
+ kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
+ } else
+ kp->ki_ngroups = cred->cr_ngroups;
+ bcopy(cred->cr_groups, kp->ki_groups,
+ kp->ki_ngroups * sizeof(gid_t));
+ kp->ki_rgid = cred->cr_rgid;
+ kp->ki_svgid = cred->cr_svgid;
+ /* If jailed(cred), emulate the old P_JAILED flag. */
+ if (jailed(cred)) {
+ kp->ki_flag |= P_JAILED;
+ /* If inside the jail, use 0 as a jail ID. */
+ if (cred->cr_prison != curthread->td_ucred->cr_prison)
+ kp->ki_jid = cred->cr_prison->pr_id;
+ }
+ strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
+ sizeof(kp->ki_loginclass));
+ }
+ ps = p->p_sigacts;
+ if (ps) {
+ mtx_lock(&ps->ps_mtx);
+ kp->ki_sigignore = ps->ps_sigignore;
+ kp->ki_sigcatch = ps->ps_sigcatch;
+ mtx_unlock(&ps->ps_mtx);
+ }
+ if (p->p_state != PRS_NEW &&
+ p->p_state != PRS_ZOMBIE &&
+ p->p_vmspace != NULL) {
+ struct vmspace *vm = p->p_vmspace;
+
+ kp->ki_size = vm->vm_map.size;
+ kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
+ FOREACH_THREAD_IN_PROC(p, td0) {
+ if (!TD_IS_SWAPPED(td0))
+ kp->ki_rssize += td0->td_kstack_pages;
+ }
+ kp->ki_swrss = vm->vm_swrss;
+ kp->ki_tsize = vm->vm_tsize;
+ kp->ki_dsize = vm->vm_dsize;
+ kp->ki_ssize = vm->vm_ssize;
+ } else if (p->p_state == PRS_ZOMBIE)
+ kp->ki_stat = SZOMB;
+ if (kp->ki_flag & P_INMEM)
+ kp->ki_sflag = PS_INMEM;
+ else
+ kp->ki_sflag = 0;
+ /* Calculate legacy swtime as seconds since 'swtick'. */
+ kp->ki_swtime = (ticks - p->p_swtick) / hz;
+ kp->ki_pid = p->p_pid;
+ kp->ki_nice = p->p_nice;
+ kp->ki_fibnum = p->p_fibnum;
+ kp->ki_start = p->p_stats->p_start;
+ timevaladd(&kp->ki_start, &boottime);
+ PROC_SLOCK(p);
+ rufetch(p, &kp->ki_rusage);
+ kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
+ calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
+ PROC_SUNLOCK(p);
+ calccru(p, &kp->ki_childutime, &kp->ki_childstime);
+ /* Some callers want child times in a single value. */
+ kp->ki_childtime = kp->ki_childstime;
+ timevaladd(&kp->ki_childtime, &kp->ki_childutime);
+
+ FOREACH_THREAD_IN_PROC(p, td0)
+ kp->ki_cow += td0->td_cow;
+
+ tp = NULL;
+ if (p->p_pgrp) {
+ kp->ki_pgid = p->p_pgrp->pg_id;
+ kp->ki_jobc = p->p_pgrp->pg_jobc;
+ sp = p->p_pgrp->pg_session;
+
+ if (sp != NULL) {
+ kp->ki_sid = sp->s_sid;
+ SESS_LOCK(sp);
+ strlcpy(kp->ki_login, sp->s_login,
+ sizeof(kp->ki_login));
+ if (sp->s_ttyvp)
+ kp->ki_kiflag |= KI_CTTY;
+ if (SESS_LEADER(p))
+ kp->ki_kiflag |= KI_SLEADER;
+ /* XXX proctree_lock */
+ tp = sp->s_ttyp;
+ SESS_UNLOCK(sp);
+ }
+ }
+ if ((p->p_flag & P_CONTROLT) && tp != NULL) {
+ kp->ki_tdev = tty_udev(tp);
+ kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ if (tp->t_session)
+ kp->ki_tsid = tp->t_session->s_sid;
+ } else
+ kp->ki_tdev = NODEV;
+ if (p->p_comm[0] != '\0')
+ strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
+ if (p->p_sysent && p->p_sysent->sv_name != NULL &&
+ p->p_sysent->sv_name[0] != '\0')
+ strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
+ kp->ki_siglist = p->p_siglist;
+ kp->ki_xstat = p->p_xstat;
+ kp->ki_acflag = p->p_acflag;
+ kp->ki_lock = p->p_lock;
+ if (p->p_pptr)
+ kp->ki_ppid = p->p_pptr->p_pid;
+}
+
+/*
+ * Fill in information that is thread specific. Must be called with
+ * target process locked. If 'preferthread' is set, overwrite certain
+ * process-related fields that are maintained for both threads and
+ * processes.
+ */
+static void
+fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ kp->ki_tdaddr = td;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (preferthread)
+ PROC_SLOCK(p);
+ thread_lock(td);
+ if (td->td_wmesg != NULL)
+ strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
+ else
+ bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
+ strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
+ if (TD_ON_LOCK(td)) {
+ kp->ki_kiflag |= KI_LOCKBLOCK;
+ strlcpy(kp->ki_lockname, td->td_lockname,
+ sizeof(kp->ki_lockname));
+ } else {
+ kp->ki_kiflag &= ~KI_LOCKBLOCK;
+ bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
+ }
+
+ if (p->p_state == PRS_NORMAL) { /* approximate. */
+ if (TD_ON_RUNQ(td) ||
+ TD_CAN_RUN(td) ||
+ TD_IS_RUNNING(td)) {
+ kp->ki_stat = SRUN;
+ } else if (P_SHOULDSTOP(p)) {
+ kp->ki_stat = SSTOP;
+ } else if (TD_IS_SLEEPING(td)) {
+ kp->ki_stat = SSLEEP;
+ } else if (TD_ON_LOCK(td)) {
+ kp->ki_stat = SLOCK;
+ } else {
+ kp->ki_stat = SWAIT;
+ }
+ } else if (p->p_state == PRS_ZOMBIE) {
+ kp->ki_stat = SZOMB;
+ } else {
+ kp->ki_stat = SIDL;
+ }
+
+ /* Things in the thread */
+ kp->ki_wchan = td->td_wchan;
+ kp->ki_pri.pri_level = td->td_priority;
+ kp->ki_pri.pri_native = td->td_base_pri;
+ kp->ki_lastcpu = td->td_lastcpu;
+ kp->ki_oncpu = td->td_oncpu;
+ kp->ki_tdflags = td->td_flags;
+ kp->ki_tid = td->td_tid;
+ kp->ki_numthreads = p->p_numthreads;
+ kp->ki_pcb = td->td_pcb;
+ kp->ki_kstack = (void *)td->td_kstack;
+ kp->ki_slptime = (ticks - td->td_slptick) / hz;
+ kp->ki_pri.pri_class = td->td_pri_class;
+ kp->ki_pri.pri_user = td->td_user_pri;
+
+ if (preferthread) {
+ rufetchtd(td, &kp->ki_rusage);
+ kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
+ kp->ki_pctcpu = sched_pctcpu(td);
+ kp->ki_estcpu = td->td_estcpu;
+ kp->ki_cow = td->td_cow;
+ }
+
+ /* We can't get this anymore, but ps etc. never used it anyway. */
+ kp->ki_rqindex = 0;
+
+ if (preferthread)
+ kp->ki_siglist = td->td_siglist;
+ kp->ki_sigmask = td->td_sigmask;
+ thread_unlock(td);
+ if (preferthread)
+ PROC_SUNLOCK(p);
+}
+
+/*
+ * Fill in a kinfo_proc structure for the specified process.
+ * Must be called with the target process locked.
+ */
+void
+fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
+{
+
+ MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
+
+ fill_kinfo_proc_only(p, kp);
+ fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0);
+ fill_kinfo_aggregate(p, kp);
+}
+
+struct pstats *
+pstats_alloc(void)
+{
+
+ return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK));
+}
+
+/*
+ * Copy parts of p_stats; zero the rest of p_stats (statistics).
+ */
+void
+pstats_fork(struct pstats *src, struct pstats *dst)
+{
+
+ bzero(&dst->pstat_startzero,
+ __rangeof(struct pstats, pstat_startzero, pstat_endzero));
+ bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
+ __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
+}
+
+void
+pstats_free(struct pstats *ps)
+{
+
+ free(ps, M_SUBPROC);
+}
+
+static struct proc *
+zpfind_locked(pid_t pid)
+{
+ struct proc *p;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, &zombproc, p_list) {
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ break;
+ }
+ }
+ return (p);
+}
+
+/*
+ * Locate a zombie process by number
+ */
+struct proc *
+zpfind(pid_t pid)
+{
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ p = zpfind_locked(pid);
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+#ifdef COMPAT_FREEBSD32
+
+/*
+ * This function is typically used to copy out the kernel address, so
+ * it can be replaced by assignment of zero.
+ */
+static inline uint32_t
+ptr32_trim(void *ptr)
+{
+ uintptr_t uptr;
+
+ uptr = (uintptr_t)ptr;
+ return ((uptr > UINT_MAX) ? 0 : uptr);
+}
+
+#define PTRTRIM_CP(src,dst,fld) \
+ do { (dst).fld = ptr32_trim((src).fld); } while (0)
+
+static void
+freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
+{
+ int i;
+
+ bzero(ki32, sizeof(struct kinfo_proc32));
+ ki32->ki_structsize = sizeof(struct kinfo_proc32);
+ CP(*ki, *ki32, ki_layout);
+ PTRTRIM_CP(*ki, *ki32, ki_args);
+ PTRTRIM_CP(*ki, *ki32, ki_paddr);
+ PTRTRIM_CP(*ki, *ki32, ki_addr);
+ PTRTRIM_CP(*ki, *ki32, ki_tracep);
+ PTRTRIM_CP(*ki, *ki32, ki_textvp);
+ PTRTRIM_CP(*ki, *ki32, ki_fd);
+ PTRTRIM_CP(*ki, *ki32, ki_vmspace);
+ PTRTRIM_CP(*ki, *ki32, ki_wchan);
+ CP(*ki, *ki32, ki_pid);
+ CP(*ki, *ki32, ki_ppid);
+ CP(*ki, *ki32, ki_pgid);
+ CP(*ki, *ki32, ki_tpgid);
+ CP(*ki, *ki32, ki_sid);
+ CP(*ki, *ki32, ki_tsid);
+ CP(*ki, *ki32, ki_jobc);
+ CP(*ki, *ki32, ki_tdev);
+ CP(*ki, *ki32, ki_siglist);
+ CP(*ki, *ki32, ki_sigmask);
+ CP(*ki, *ki32, ki_sigignore);
+ CP(*ki, *ki32, ki_sigcatch);
+ CP(*ki, *ki32, ki_uid);
+ CP(*ki, *ki32, ki_ruid);
+ CP(*ki, *ki32, ki_svuid);
+ CP(*ki, *ki32, ki_rgid);
+ CP(*ki, *ki32, ki_svgid);
+ CP(*ki, *ki32, ki_ngroups);
+ for (i = 0; i < KI_NGROUPS; i++)
+ CP(*ki, *ki32, ki_groups[i]);
+ CP(*ki, *ki32, ki_size);
+ CP(*ki, *ki32, ki_rssize);
+ CP(*ki, *ki32, ki_swrss);
+ CP(*ki, *ki32, ki_tsize);
+ CP(*ki, *ki32, ki_dsize);
+ CP(*ki, *ki32, ki_ssize);
+ CP(*ki, *ki32, ki_xstat);
+ CP(*ki, *ki32, ki_acflag);
+ CP(*ki, *ki32, ki_pctcpu);
+ CP(*ki, *ki32, ki_estcpu);
+ CP(*ki, *ki32, ki_slptime);
+ CP(*ki, *ki32, ki_swtime);
+ CP(*ki, *ki32, ki_cow);
+ CP(*ki, *ki32, ki_runtime);
+ TV_CP(*ki, *ki32, ki_start);
+ TV_CP(*ki, *ki32, ki_childtime);
+ CP(*ki, *ki32, ki_flag);
+ CP(*ki, *ki32, ki_kiflag);
+ CP(*ki, *ki32, ki_traceflag);
+ CP(*ki, *ki32, ki_stat);
+ CP(*ki, *ki32, ki_nice);
+ CP(*ki, *ki32, ki_lock);
+ CP(*ki, *ki32, ki_rqindex);
+ CP(*ki, *ki32, ki_oncpu);
+ CP(*ki, *ki32, ki_lastcpu);
+ bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
+ bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
+ bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
+ bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
+ bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
+ bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
+ bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
+ CP(*ki, *ki32, ki_fibnum);
+ CP(*ki, *ki32, ki_cr_flags);
+ CP(*ki, *ki32, ki_jid);
+ CP(*ki, *ki32, ki_numthreads);
+ CP(*ki, *ki32, ki_tid);
+ CP(*ki, *ki32, ki_pri);
+ freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
+ freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
+ PTRTRIM_CP(*ki, *ki32, ki_pcb);
+ PTRTRIM_CP(*ki, *ki32, ki_kstack);
+ PTRTRIM_CP(*ki, *ki32, ki_udata);
+ CP(*ki, *ki32, ki_sflag);
+ CP(*ki, *ki32, ki_tdflags);
+}
+#endif
+
+int
+kern_proc_out(struct proc *p, struct sbuf *sb, int flags)
+{
+ struct thread *td;
+ struct kinfo_proc ki;
+#ifdef COMPAT_FREEBSD32
+ struct kinfo_proc32 ki32;
+#endif
+ int error;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
+
+ error = 0;
+ fill_kinfo_proc(p, &ki);
+ if ((flags & KERN_PROC_NOTHREADS) != 0) {
+#ifdef COMPAT_FREEBSD32
+ if ((flags & KERN_PROC_MASK32) != 0) {
+ freebsd32_kinfo_proc_out(&ki, &ki32);
+ error = sbuf_bcat(sb, &ki32, sizeof(ki32));
+ } else
+#endif
+ error = sbuf_bcat(sb, &ki, sizeof(ki));
+ } else {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ fill_kinfo_thread(td, &ki, 1);
+#ifdef COMPAT_FREEBSD32
+ if ((flags & KERN_PROC_MASK32) != 0) {
+ freebsd32_kinfo_proc_out(&ki, &ki32);
+ error = sbuf_bcat(sb, &ki32, sizeof(ki32));
+ } else
+#endif
+ error = sbuf_bcat(sb, &ki, sizeof(ki));
+ if (error)
+ break;
+ }
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+static int
+sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags,
+ int doingzomb)
+{
+ struct sbuf sb;
+ struct kinfo_proc ki;
+ struct proc *np;
+ int error, error2;
+ pid_t pid;
+
+ pid = p->p_pid;
+ sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req);
+ error = kern_proc_out(p, &sb, flags);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ if (error != 0)
+ return (error);
+ else if (error2 != 0)
+ return (error2);
+ if (doingzomb)
+ np = zpfind(pid);
+ else {
+ if (pid == 0)
+ return (0);
+ np = pfind(pid);
+ }
+ if (np == NULL)
+ return (ESRCH);
+ if (np != p) {
+ PROC_UNLOCK(np);
+ return (ESRCH);
+ }
+ PROC_UNLOCK(np);
+ return (0);
+}
+
+static int
+sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int flags, doingzomb, oid_number;
+ int error = 0;
+
+ oid_number = oidp->oid_number;
+ if (oid_number != KERN_PROC_ALL &&
+ (oid_number & KERN_PROC_INC_THREAD) == 0)
+ flags = KERN_PROC_NOTHREADS;
+ else {
+ flags = 0;
+ oid_number &= ~KERN_PROC_INC_THREAD;
+ }
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ flags |= KERN_PROC_MASK32;
+#endif
+ if (oid_number == KERN_PROC_PID) {
+ if (namelen != 1)
+ return (EINVAL);
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error)
+ return (error);
+ error = pget((pid_t)name[0], PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ error = sysctl_out_proc(p, req, flags, 0);
+ return (error);
+ }
+
+ switch (oid_number) {
+ case KERN_PROC_ALL:
+ if (namelen != 0)
+ return (EINVAL);
+ break;
+ case KERN_PROC_PROC:
+ if (namelen != 0 && namelen != 1)
+ return (EINVAL);
+ break;
+ default:
+ if (namelen != 1)
+ return (EINVAL);
+ break;
+ }
+
+ if (!req->oldptr) {
+ /* overestimate by 5 procs */
+ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
+ if (error)
+ return (error);
+ }
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sx_slock(&allproc_lock);
+ for (doingzomb = 0; doingzomb < 2; doingzomb++) {
+ if (!doingzomb)
+ p = LIST_FIRST(&allproc);
+ else
+ p = LIST_FIRST(&zombproc);
+ for (; p != 0; p = LIST_NEXT(p, p_list)) {
+ /*
+ * Skip embryonic processes.
+ */
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ KASSERT(p->p_ucred != NULL,
+ ("process credential is NULL for non-NEW proc"));
+ /*
+ * Show a user only appropriate processes.
+ */
+ if (p_cansee(curthread, p)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * TODO - make more efficient (see notes below).
+ * do by session.
+ */
+ switch (oid_number) {
+
+ case KERN_PROC_GID:
+ if (p->p_ucred->cr_gid != (gid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_PGRP:
+ /* could do this by traversing pgrp */
+ if (p->p_pgrp == NULL ||
+ p->p_pgrp->pg_id != (pid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_RGID:
+ if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_SESSION:
+ if (p->p_session == NULL ||
+ p->p_session->s_sid != (pid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_TTY:
+ if ((p->p_flag & P_CONTROLT) == 0 ||
+ p->p_session == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /* XXX proctree_lock */
+ SESS_LOCK(p->p_session);
+ if (p->p_session->s_ttyp == NULL ||
+ tty_udev(p->p_session->s_ttyp) !=
+ (dev_t)name[0]) {
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ continue;
+ }
+ SESS_UNLOCK(p->p_session);
+ break;
+
+ case KERN_PROC_UID:
+ if (p->p_ucred->cr_uid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_RUID:
+ if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_PROC:
+ break;
+
+ default:
+ break;
+
+ }
+
+ error = sysctl_out_proc(p, req, flags, doingzomb);
+ if (error) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ return (0);
+}
+
+struct pargs *
+pargs_alloc(int len)
+{
+ struct pargs *pa;
+
+ pa = malloc(sizeof(struct pargs) + len, M_PARGS,
+ M_WAITOK);
+ refcount_init(&pa->ar_ref, 1);
+ pa->ar_length = len;
+ return (pa);
+}
+
+static void
+pargs_free(struct pargs *pa)
+{
+
+ free(pa, M_PARGS);
+}
+
+void
+pargs_hold(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ refcount_acquire(&pa->ar_ref);
+}
+
+void
+pargs_drop(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ if (refcount_release(&pa->ar_ref))
+ pargs_free(pa);
+}
+
+static int
+proc_read_mem(struct thread *td, struct proc *p, vm_offset_t offset, void* buf,
+ size_t len)
+{
+ struct iovec iov;
+ struct uio uio;
+
+ iov.iov_base = (caddr_t)buf;
+ iov.iov_len = len;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = offset;
+ uio.uio_resid = (ssize_t)len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = td;
+
+ return (proc_rwmem(p, &uio));
+}
+
+static int
+proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf,
+ size_t len)
+{
+ size_t i;
+ int error;
+
+ error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, len);
+ /*
+ * Reading the chunk may validly return EFAULT if the string is shorter
+ * than the chunk and is aligned at the end of the page, assuming the
+ * next page is not mapped. So if EFAULT is returned, fall back to a
+ * one-byte read loop.
+ */
+ if (error == EFAULT) {
+ for (i = 0; i < len; i++, buf++, sptr++) {
+ error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, 1);
+ if (error != 0)
+ return (error);
+ if (*buf == '\0')
+ break;
+ }
+ error = 0;
+ }
+ return (error);
+}
+
+#define PROC_AUXV_MAX 256 /* Safety limit on auxv size. */
+
+enum proc_vector_type {
+ PROC_ARG,
+ PROC_ENV,
+ PROC_AUX,
+};
+
+#ifdef COMPAT_FREEBSD32
+static int
+get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp,
+ size_t *vsizep, enum proc_vector_type type)
+{
+ struct freebsd32_ps_strings pss;
+ Elf32_Auxinfo aux;
+ vm_offset_t vptr, ptr;
+ uint32_t *proc_vector32;
+ char **proc_vector;
+ size_t vsize, size;
+ int i, error;
+
+ error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
+ &pss, sizeof(pss));
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case PROC_ARG:
+ vptr = (vm_offset_t)PTRIN(pss.ps_argvstr);
+ vsize = pss.ps_nargvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(int32_t);
+ break;
+ case PROC_ENV:
+ vptr = (vm_offset_t)PTRIN(pss.ps_envstr);
+ vsize = pss.ps_nenvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(int32_t);
+ break;
+ case PROC_AUX:
+ vptr = (vm_offset_t)PTRIN(pss.ps_envstr) +
+ (pss.ps_nenvstr + 1) * sizeof(int32_t);
+ if (vptr % 4 != 0)
+ return (ENOEXEC);
+ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
+ error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
+ if (error != 0)
+ return (error);
+ if (aux.a_type == AT_NULL)
+ break;
+ ptr += sizeof(aux);
+ }
+ if (aux.a_type != AT_NULL)
+ return (ENOEXEC);
+ vsize = i + 1;
+ size = vsize * sizeof(aux);
+ break;
+ default:
+ KASSERT(0, ("Wrong proc vector type: %d", type));
+ return (EINVAL);
+ }
+ proc_vector32 = malloc(size, M_TEMP, M_WAITOK);
+ error = proc_read_mem(td, p, vptr, proc_vector32, size);
+ if (error != 0)
+ goto done;
+ if (type == PROC_AUX) {
+ *proc_vectorp = (char **)proc_vector32;
+ *vsizep = vsize;
+ return (0);
+ }
+ proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK);
+ for (i = 0; i < (int)vsize; i++)
+ proc_vector[i] = PTRIN(proc_vector32[i]);
+ *proc_vectorp = proc_vector;
+ *vsizep = vsize;
+done:
+ free(proc_vector32, M_TEMP);
+ return (error);
+}
+#endif
+
+static int
+get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp,
+ size_t *vsizep, enum proc_vector_type type)
+{
+ struct ps_strings pss;
+ Elf_Auxinfo aux;
+ vm_offset_t vptr, ptr;
+ char **proc_vector;
+ size_t vsize, size;
+ int error, i;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(p, SV_ILP32) != 0)
+ return (get_proc_vector32(td, p, proc_vectorp, vsizep, type));
+#endif
+ error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
+ &pss, sizeof(pss));
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case PROC_ARG:
+ vptr = (vm_offset_t)pss.ps_argvstr;
+ vsize = pss.ps_nargvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(char *);
+ break;
+ case PROC_ENV:
+ vptr = (vm_offset_t)pss.ps_envstr;
+ vsize = pss.ps_nenvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(char *);
+ break;
+ case PROC_AUX:
+ /*
+ * The aux array is just above env array on the stack. Check
+ * that the address is naturally aligned.
+ */
+ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1)
+ * sizeof(char *);
+#if __ELF_WORD_SIZE == 64
+ if (vptr % sizeof(uint64_t) != 0)
+#else
+ if (vptr % sizeof(uint32_t) != 0)
+#endif
+ return (ENOEXEC);
+ /*
+ * We count the array size by reading the aux vectors from the
+ * stack until the AT_NULL vector is returned. So (to keep the code
+ * simple) we read the process stack twice: the first time here
+ * to find the size and the second time when copying the vectors
+ * to the allocated proc_vector.
+ */
+ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
+ error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
+ if (error != 0)
+ return (error);
+ if (aux.a_type == AT_NULL)
+ break;
+ ptr += sizeof(aux);
+ }
+ /*
+ * If the PROC_AUXV_MAX entries are iterated over, and we have
+ * not reached AT_NULL, it is most likely we are reading wrong
+ * data: either the process doesn't have auxv array or data has
+ * been modified. Return the error in this case.
+ */
+ if (aux.a_type != AT_NULL)
+ return (ENOEXEC);
+ vsize = i + 1;
+ size = vsize * sizeof(aux);
+ break;
+ default:
+ KASSERT(0, ("Wrong proc vector type: %d", type));
+ return (EINVAL); /* In case we are built without INVARIANTS. */
+ }
+ proc_vector = malloc(size, M_TEMP, M_WAITOK);
+ if (proc_vector == NULL)
+ return (ENOMEM);
+ error = proc_read_mem(td, p, vptr, proc_vector, size);
+ if (error != 0) {
+ free(proc_vector, M_TEMP);
+ return (error);
+ }
+ *proc_vectorp = proc_vector;
+ *vsizep = vsize;
+
+ return (0);
+}
+
+#define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */
+
+static int
+get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb,
+ enum proc_vector_type type)
+{
+ size_t done, len, nchr, vsize;
+ int error, i;
+ char **proc_vector, *sptr;
+ char pss_string[GET_PS_STRINGS_CHUNK_SZ];
+
+ PROC_ASSERT_HELD(p);
+
+ /*
+ * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes.
+ */
+ nchr = 2 * (PATH_MAX + ARG_MAX);
+
+ error = get_proc_vector(td, p, &proc_vector, &vsize, type);
+ if (error != 0)
+ return (error);
+ for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) {
+ /*
+ * The program may have scribbled into its argv array, e.g. to
+ * remove some arguments. If that has happened, break out
+ * before trying to read from NULL.
+ */
+ if (proc_vector[i] == NULL)
+ break;
+ for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) {
+ error = proc_read_string(td, p, sptr, pss_string,
+ sizeof(pss_string));
+ if (error != 0)
+ goto done;
+ len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ);
+ if (done + len >= nchr)
+ len = nchr - done - 1;
+ sbuf_bcat(sb, pss_string, len);
+ if (len != GET_PS_STRINGS_CHUNK_SZ)
+ break;
+ done += GET_PS_STRINGS_CHUNK_SZ;
+ }
+ sbuf_bcat(sb, "", 1);
+ done += len + 1;
+ }
+done:
+ free(proc_vector, M_TEMP);
+ return (error);
+}
+
+int
+proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+
+ return (get_ps_strings(curthread, p, sb, PROC_ARG));
+}
+
+int
+proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+
+ return (get_ps_strings(curthread, p, sb, PROC_ENV));
+}
+
+int
+proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+ size_t vsize, size;
+ char **auxv;
+ int error;
+
+ error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX);
+ if (error == 0) {
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(p, SV_ILP32) != 0)
+ size = vsize * sizeof(Elf32_Auxinfo);
+ else
+#endif
+ size = vsize * sizeof(Elf_Auxinfo);
+ error = sbuf_bcat(sb, auxv, size);
+ free(auxv, M_TEMP);
+ }
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the argument list or process
+ * title for another process without groping around in the address space
+ * of the other process. It also allows a process to set its own process
+ * title to a string of its own choice.
+ */
+static int
+sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct pargs *newpa, *pa;
+ struct proc *p;
+ struct sbuf sb;
+ int flags, error = 0, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ flags = PGET_CANSEE;
+ if (req->newptr != NULL)
+ flags |= PGET_ISCURRENT;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error)
+ return (error);
+
+ pa = p->p_args;
+ if (pa != NULL) {
+ pargs_hold(pa);
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
+ pargs_drop(pa);
+ } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) {
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getargv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ if (error == 0 && error2 != 0)
+ error = error2;
+ } else {
+ PROC_UNLOCK(p);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
+ return (ENOMEM);
+ newpa = pargs_alloc(req->newlen);
+ error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
+ if (error != 0) {
+ pargs_free(newpa);
+ return (error);
+ }
+ PROC_LOCK(p);
+ pa = p->p_args;
+ p->p_args = newpa;
+ PROC_UNLOCK(p);
+ pargs_drop(pa);
+ return (0);
+}
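[Editorial sketch, not part of this change: the read side of this handler is reachable from userland with the MIB { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid }. A self-contained, hedged example follows; the 4096-byte buffer is an arbitrary choice and error handling is kept minimal.]

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, (int)getpid() };
		char buf[4096];
		size_t len = sizeof(buf);

		if (sysctl(mib, 4, buf, &len, NULL, 0) == -1)
			return (1);
		/* Arguments come back as consecutive NUL-terminated strings. */
		for (size_t off = 0; off < len; off += strlen(buf + off) + 1)
			printf("%s\n", buf + off);
		return (0);
	}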
+
+/*
+ * This sysctl allows a process to retrieve the environment of another process.
+ */
+static int
+sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ PRELE(p);
+ return (0);
+ }
+
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getenvv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+/*
+ * This sysctl allows a process to retrieve the ELF auxiliary vector of
+ * another process.
+ */
+static int
+sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ PRELE(p);
+ return (0);
+ }
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getauxv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+/*
+ * This sysctl allows a process to retrieve the path of the executable for
+ * itself or another process.
+ */
+static int
+sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
+{
+ pid_t *pidp = (pid_t *)arg1;
+ unsigned int arglen = arg2;
+ struct proc *p;
+ struct vnode *vp;
+ char *retbuf, *freebuf;
+ int error;
+
+ if (arglen != 1)
+ return (EINVAL);
+ if (*pidp == -1) { /* -1 means this process */
+ p = req->td->td_proc;
+ } else {
+ error = pget(*pidp, PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ }
+
+ vp = p->p_textvp;
+ if (vp == NULL) {
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ vref(vp);
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+ error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
+ vrele(vp);
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
+ free(freebuf, M_TEMP);
+ return (error);
+}
+
+static int
+sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ char *sv_name;
+ int *name;
+ int namelen;
+ int error;
+
+ namelen = arg2;
+ if (namelen != 1)
+ return (EINVAL);
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ sv_name = p->p_sysent->sv_name;
+ PROC_UNLOCK(p);
+ return (sysctl_handle_string(oidp, sv_name, 0, req));
+}
+
+#ifdef KINFO_OVMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
+#endif
+
+#ifdef COMPAT_FREEBSD7
+static int
+sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
+{
+ vm_map_entry_t entry, tmp_entry;
+ unsigned int last_timestamp;
+ char *fullpath, *freepath;
+ struct kinfo_ovmentry *kve;
+ struct vattr va;
+ struct ucred *cred;
+ int error, *name;
+ struct vnode *vp;
+ struct proc *p;
+ vm_map_t map;
+ struct vmspace *vm;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PRELE(p);
+ return (ESRCH);
+ }
+ kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
+
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj, tobj, lobj;
+ vm_offset_t addr;
+
+ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
+ continue;
+
+ bzero(kve, sizeof(*kve));
+ kve->kve_structsize = sizeof(*kve);
+
+ kve->kve_private_resident = 0;
+ obj = entry->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ if (obj->shadow_count == 1)
+ kve->kve_private_resident =
+ obj->resident_page_count;
+ }
+ kve->kve_resident = 0;
+ addr = entry->start;
+ while (addr < entry->end) {
+ if (pmap_extract(map->pmap, addr))
+ kve->kve_resident++;
+ addr += PAGE_SIZE;
+ }
+
+ for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ }
+
+ kve->kve_start = (void*)entry->start;
+ kve->kve_end = (void*)entry->end;
+ kve->kve_offset = (off_t)entry->offset;
+
+ if (entry->protection & VM_PROT_READ)
+ kve->kve_protection |= KVME_PROT_READ;
+ if (entry->protection & VM_PROT_WRITE)
+ kve->kve_protection |= KVME_PROT_WRITE;
+ if (entry->protection & VM_PROT_EXECUTE)
+ kve->kve_protection |= KVME_PROT_EXEC;
+
+ if (entry->eflags & MAP_ENTRY_COW)
+ kve->kve_flags |= KVME_FLAG_COW;
+ if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
+ kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
+ if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
+ kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+
+ last_timestamp = map->timestamp;
+ vm_map_unlock_read(map);
+
+ kve->kve_fileid = 0;
+ kve->kve_fsid = 0;
+ freepath = NULL;
+ fullpath = "";
+ if (lobj) {
+ vp = NULL;
+ switch (lobj->type) {
+ case OBJT_DEFAULT:
+ kve->kve_type = KVME_TYPE_DEFAULT;
+ break;
+ case OBJT_VNODE:
+ kve->kve_type = KVME_TYPE_VNODE;
+ vp = lobj->handle;
+ vref(vp);
+ break;
+ case OBJT_SWAP:
+ kve->kve_type = KVME_TYPE_SWAP;
+ break;
+ case OBJT_DEVICE:
+ kve->kve_type = KVME_TYPE_DEVICE;
+ break;
+ case OBJT_PHYS:
+ kve->kve_type = KVME_TYPE_PHYS;
+ break;
+ case OBJT_DEAD:
+ kve->kve_type = KVME_TYPE_DEAD;
+ break;
+ case OBJT_SG:
+ kve->kve_type = KVME_TYPE_SG;
+ break;
+ default:
+ kve->kve_type = KVME_TYPE_UNKNOWN;
+ break;
+ }
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+
+ kve->kve_ref_count = obj->ref_count;
+ kve->kve_shadow_count = obj->shadow_count;
+ VM_OBJECT_RUNLOCK(obj);
+ if (vp != NULL) {
+ vn_fullpath(curthread, vp, &fullpath,
+ &freepath);
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &va, cred) == 0) {
+ kve->kve_fileid = va.va_fileid;
+ kve->kve_fsid = va.va_fsid;
+ }
+ vput(vp);
+ }
+ } else {
+ kve->kve_type = KVME_TYPE_NONE;
+ kve->kve_ref_count = 0;
+ kve->kve_shadow_count = 0;
+ }
+
+ strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ error = SYSCTL_OUT(req, kve, sizeof(*kve));
+ vm_map_lock_read(map);
+ if (error)
+ break;
+ if (last_timestamp != map->timestamp) {
+ vm_map_lookup_entry(map, addr - 1, &tmp_entry);
+ entry = tmp_entry;
+ }
+ }
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+ PRELE(p);
+ free(kve, M_TEMP);
+ return (error);
+}
+#endif /* COMPAT_FREEBSD7 */
+
+#ifdef KINFO_VMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
+#endif
+
+/*
+ * Must be called with the process locked and will return unlocked.
+ */
+int
+kern_proc_vmmap_out(struct proc *p, struct sbuf *sb)
+{
+ vm_map_entry_t entry, tmp_entry;
+ unsigned int last_timestamp;
+ char *fullpath, *freepath;
+ struct kinfo_vmentry *kve;
+ struct vattr va;
+ struct ucred *cred;
+ int error;
+ struct vnode *vp;
+ struct vmspace *vm;
+ vm_map_t map;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PRELE(p);
+ return (ESRCH);
+ }
+ kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
+
+ error = 0;
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj, tobj, lobj;
+ vm_offset_t addr;
+ vm_paddr_t locked_pa;
+ int mincoreinfo;
+
+ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
+ continue;
+
+ bzero(kve, sizeof(*kve));
+
+ kve->kve_private_resident = 0;
+ obj = entry->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ if (obj->shadow_count == 1)
+ kve->kve_private_resident =
+ obj->resident_page_count;
+ }
+ kve->kve_resident = 0;
+ addr = entry->start;
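+		/*
+		 * Count resident pages and detect superpage mappings by
+		 * querying the pmap one page at a time.
+		 */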
+ while (addr < entry->end) {
+ locked_pa = 0;
+ mincoreinfo = pmap_mincore(map->pmap, addr, &locked_pa);
+ if (locked_pa != 0)
+ vm_page_unlock(PHYS_TO_VM_PAGE(locked_pa));
+ if (mincoreinfo & MINCORE_INCORE)
+ kve->kve_resident++;
+ if (mincoreinfo & MINCORE_SUPER)
+ kve->kve_flags |= KVME_FLAG_SUPER;
+ addr += PAGE_SIZE;
+ }
+
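+		/*
+		 * Descend the backing object chain to find the bottom-most
+		 * object, keeping only the object currently being examined
+		 * locked (plus the top-level object).
+		 */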
+ for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ }
+
+ kve->kve_start = entry->start;
+ kve->kve_end = entry->end;
+ kve->kve_offset = entry->offset;
+
+ if (entry->protection & VM_PROT_READ)
+ kve->kve_protection |= KVME_PROT_READ;
+ if (entry->protection & VM_PROT_WRITE)
+ kve->kve_protection |= KVME_PROT_WRITE;
+ if (entry->protection & VM_PROT_EXECUTE)
+ kve->kve_protection |= KVME_PROT_EXEC;
+
+ if (entry->eflags & MAP_ENTRY_COW)
+ kve->kve_flags |= KVME_FLAG_COW;
+ if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
+ kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
+ if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
+ kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+ if (entry->eflags & MAP_ENTRY_GROWS_UP)
+ kve->kve_flags |= KVME_FLAG_GROWS_UP;
+ if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
+ kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
+
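+		/*
+		 * Record the map timestamp and release the map lock around
+		 * the vnode operations and sbuf copyout below, which may
+		 * sleep.
+		 */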
+ last_timestamp = map->timestamp;
+ vm_map_unlock_read(map);
+
+ freepath = NULL;
+ fullpath = "";
+ if (lobj) {
+ vp = NULL;
+ switch (lobj->type) {
+ case OBJT_DEFAULT:
+ kve->kve_type = KVME_TYPE_DEFAULT;
+ break;
+ case OBJT_VNODE:
+ kve->kve_type = KVME_TYPE_VNODE;
+ vp = lobj->handle;
+ vref(vp);
+ break;
+ case OBJT_SWAP:
+ kve->kve_type = KVME_TYPE_SWAP;
+ break;
+ case OBJT_DEVICE:
+ kve->kve_type = KVME_TYPE_DEVICE;
+ break;
+ case OBJT_PHYS:
+ kve->kve_type = KVME_TYPE_PHYS;
+ break;
+ case OBJT_DEAD:
+ kve->kve_type = KVME_TYPE_DEAD;
+ break;
+ case OBJT_SG:
+ kve->kve_type = KVME_TYPE_SG;
+ break;
+ default:
+ kve->kve_type = KVME_TYPE_UNKNOWN;
+ break;
+ }
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+
+ kve->kve_ref_count = obj->ref_count;
+ kve->kve_shadow_count = obj->shadow_count;
+ VM_OBJECT_RUNLOCK(obj);
+ if (vp != NULL) {
+ vn_fullpath(curthread, vp, &fullpath,
+ &freepath);
+ kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &va, cred) == 0) {
+ kve->kve_vn_fileid = va.va_fileid;
+ kve->kve_vn_fsid = va.va_fsid;
+ kve->kve_vn_mode =
+ MAKEIMODE(va.va_type, va.va_mode);
+ kve->kve_vn_size = va.va_size;
+ kve->kve_vn_rdev = va.va_rdev;
+ kve->kve_status = KF_ATTR_VALID;
+ }
+ vput(vp);
+ }
+ } else {
+ kve->kve_type = KVME_TYPE_NONE;
+ kve->kve_ref_count = 0;
+ kve->kve_shadow_count = 0;
+ }
+
+ strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ /* Pack record size down */
+ kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
+ strlen(kve->kve_path) + 1;
+ kve->kve_structsize = roundup(kve->kve_structsize,
+ sizeof(uint64_t));
+ error = sbuf_bcat(sb, kve, kve->kve_structsize);
+ vm_map_lock_read(map);
+ if (error)
+ break;
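+		/*
+		 * The map may have changed while it was unlocked; if so,
+		 * re-find the entry containing the last processed address
+		 * and resume.
+		 */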
+ if (last_timestamp != map->timestamp) {
+ vm_map_lookup_entry(map, addr - 1, &tmp_entry);
+ entry = tmp_entry;
+ }
+ }
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+ PRELE(p);
+ free(kve, M_TEMP);
+ return (error);
+}
+
+static int
+sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+ sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ error = kern_proc_vmmap_out(p, &sb);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+#if defined(STACK) || defined(DDB)
+static int
+sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
+{
+ struct kinfo_kstack *kkstp;
+ int error, i, *name, numthreads;
+ lwpid_t *lwpidarray;
+ struct thread *td;
+ struct stack *st;
+ struct sbuf sb;
+ struct proc *p;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+
+ kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
+ st = stack_create();
+
+ lwpidarray = NULL;
+ numthreads = 0;
+ PROC_LOCK(p);
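+	/*
+	 * The number of threads may change while the proc lock is dropped
+	 * for the allocation below, so re-check and reallocate until the
+	 * lwpid array is large enough.
+	 */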
+repeat:
+ if (numthreads < p->p_numthreads) {
+ if (lwpidarray != NULL) {
+ free(lwpidarray, M_TEMP);
+ lwpidarray = NULL;
+ }
+ numthreads = p->p_numthreads;
+ PROC_UNLOCK(p);
+ lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
+ M_WAITOK | M_ZERO);
+ PROC_LOCK(p);
+ goto repeat;
+ }
+ i = 0;
+
+ /*
+ * XXXRW: During the below loop, execve(2) and countless other sorts
+ * of changes could have taken place. Should we check to see if the
+ * vmspace has been replaced, or the like, in order to prevent
+ * giving a snapshot that spans, say, execve(2), with some threads
+ * before and some after? Among other things, the credentials could
+ * have changed, in which case the right to extract debug info might
+ * no longer be assured.
+ */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ KASSERT(i < numthreads,
+ ("sysctl_kern_proc_kstack: numthreads"));
+ lwpidarray[i] = td->td_tid;
+ i++;
+ }
+ numthreads = i;
+ for (i = 0; i < numthreads; i++) {
+ td = thread_find(p, lwpidarray[i]);
+ if (td == NULL) {
+ continue;
+ }
+ bzero(kkstp, sizeof(*kkstp));
+ (void)sbuf_new(&sb, kkstp->kkst_trace,
+ sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
+ thread_lock(td);
+ kkstp->kkst_tid = td->td_tid;
+ if (TD_IS_SWAPPED(td))
+ kkstp->kkst_state = KKST_STATE_SWAPPED;
+ else if (TD_IS_RUNNING(td))
+ kkstp->kkst_state = KKST_STATE_RUNNING;
+ else {
+ kkstp->kkst_state = KKST_STATE_STACKOK;
+ stack_save_td(st, td);
+ }
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ stack_sbuf_print(&sb, st);
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
+ PROC_LOCK(p);
+ if (error)
+ break;
+ }
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ if (lwpidarray != NULL)
+ free(lwpidarray, M_TEMP);
+ stack_destroy(st);
+ free(kkstp, M_TEMP);
+ return (error);
+}
+#endif
+
+/*
+ * This sysctl allows a process to retrieve the full list of groups from
+ * itself or another process.
+ */
+static int
+sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
+{
+ pid_t *pidp = (pid_t *)arg1;
+ unsigned int arglen = arg2;
+ struct proc *p;
+ struct ucred *cred;
+ int error;
+
+ if (arglen != 1)
+ return (EINVAL);
+ if (*pidp == -1) { /* -1 means this process */
+ p = req->td->td_proc;
+ } else {
+ error = pget(*pidp, PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ }
+
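+	/*
+	 * Hold a reference on the credentials so they remain valid while
+	 * being copied out.
+	 */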
+ cred = crhold(p->p_ucred);
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+
+ error = SYSCTL_OUT(req, cred->cr_groups,
+ cred->cr_ngroups * sizeof(gid_t));
+ crfree(cred);
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve and/or set the resource limit for
+ * another process.
+ */
+static int
+sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct rlimit rlim;
+ struct proc *p;
+ u_int which;
+ int flags, error;
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ which = (u_int)name[1];
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+
+ if (req->newptr != NULL && req->newlen != sizeof(rlim))
+ return (EINVAL);
+
+ flags = PGET_HOLD | PGET_NOTWEXIT;
+ if (req->newptr != NULL)
+ flags |= PGET_CANDEBUG;
+ else
+ flags |= PGET_CANSEE;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve limit.
+ */
+ if (req->oldptr != NULL) {
+ PROC_LOCK(p);
+ lim_rlimit(p, which, &rlim);
+ PROC_UNLOCK(p);
+ }
+ error = SYSCTL_OUT(req, &rlim, sizeof(rlim));
+ if (error != 0)
+ goto errout;
+
+ /*
+ * Set limit.
+ */
+ if (req->newptr != NULL) {
+ error = SYSCTL_IN(req, &rlim, sizeof(rlim));
+ if (error == 0)
+ error = kern_proc_setrlimit(curthread, p, which, &rlim);
+ }
+
+errout:
+ PRELE(p);
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the ps_strings structure location of
+ * another process.
+ */
+static int
+sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ vm_offset_t ps_strings;
+ int error;
+#ifdef COMPAT_FREEBSD32
+ uint32_t ps_strings32;
+#endif
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ if ((req->flags & SCTL_MASK32) != 0) {
+ /*
+ * We return 0 if the 32 bit emulation request is for a 64 bit
+ * process.
+ */
+ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ?
+ PTROUT(p->p_sysent->sv_psstrings) : 0;
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32));
+ return (error);
+ }
+#endif
+ ps_strings = p->p_sysent->sv_psstrings;
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings));
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the umask of another process.
+ */
+static int
+sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int error;
+ u_short fd_cmask;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+
+ FILEDESC_SLOCK(p->p_fd);
+ fd_cmask = p->p_fd->fd_cmask;
+ FILEDESC_SUNLOCK(p->p_fd);
+ PRELE(p);
+ error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask));
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to set and retrieve the binary osreldate of
+ * another process.
+ */
+static int
+sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int flags, error, osrel;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ if (req->newptr != NULL && req->newlen != sizeof(osrel))
+ return (EINVAL);
+
+ flags = PGET_HOLD | PGET_NOTWEXIT;
+ if (req->newptr != NULL)
+ flags |= PGET_CANDEBUG;
+ else
+ flags |= PGET_CANSEE;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error != 0)
+ return (error);
+
+ error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel));
+ if (error != 0)
+ goto errout;
+
+ if (req->newptr != NULL) {
+ error = SYSCTL_IN(req, &osrel, sizeof(osrel));
+ if (error != 0)
+ goto errout;
+ if (osrel < 0) {
+ error = EINVAL;
+ goto errout;
+ }
+ p->p_osrel = osrel;
+ }
+errout:
+ PRELE(p);
+ return (error);
+}
+
+SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
+
+SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
+ "Return entire process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Return process table, no threads");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
+ CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
+ sysctl_kern_proc_args, "Process argument list");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc_env, "Process environment");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
+ "Process syscall vector name (ABI type)");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
+ sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
+ "Return process table, no threads");
+
+#ifdef COMPAT_FREEBSD7
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
+#endif
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
+
+#if defined(STACK) || defined(DDB)
+static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
+#endif
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW |
+ CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit,
+ "Process resource limits");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings,
+ "Process ps_strings location");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW |
+ CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel,
+ "Process binary osreldate");
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
new file mode 100644
index 0000000..f99e053
--- /dev/null
+++ b/sys/kern/kern_prot.c
@@ -0,0 +1,2222 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
+ * The Regents of the University of California.
+ * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2000-2001 Robert N. M. Watson.
+ * All rights reserved.
+ *
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/sx.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#ifdef REGRESSION
+FEATURE(regression,
+ "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)");
+#endif
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
+
+static void crextend(struct ucred *cr, int n);
+static void crsetgroups_locked(struct ucred *cr, int ngrp,
+ gid_t *groups);
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getpid(struct thread *td, struct getpid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ td->td_retval[0] = p->p_pid;
+#if defined(COMPAT_43)
+ PROC_LOCK(p);
+ td->td_retval[1] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getppid(struct thread *td, struct getppid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Get process group ID; note that POSIX getpgrp takes no parameter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+ int dummy;
+};
+#endif
+int
+sys_getpgrp(struct thread *td, struct getpgrp_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+ pid_t pid;
+};
+#endif
+int
+sys_getpgid(struct thread *td, struct getpgid_args *uap)
+{
+ struct proc *p;
+ int error;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ }
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+ pid_t pid;
+};
+#endif
+int
+sys_getsid(struct thread *td, struct getsid_args *uap)
+{
+ struct proc *p;
+ int error;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ }
+ td->td_retval[0] = p->p_session->s_sid;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getuid(struct thread *td, struct getuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_ruid;
+#if defined(COMPAT_43)
+ td->td_retval[1] = td->td_ucred->cr_uid;
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_geteuid(struct thread *td, struct geteuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_uid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getgid(struct thread *td, struct getgid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_rgid;
+#if defined(COMPAT_43)
+ td->td_retval[1] = td->td_ucred->cr_groups[0];
+#endif
+ return (0);
+}
+
+/*
+ * Get effective group ID. The "egid" is groups[0], and could be obtained
+ * via getgroups. This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getegid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getegid(struct thread *td, struct getegid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_groups[0];
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+int
+sys_getgroups(struct thread *td, register struct getgroups_args *uap)
+{
+ gid_t *groups;
+ u_int ngrp;
+ int error;
+
+ if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
+ if (uap->gidsetsize == 0)
+ ngrp = 0;
+ else
+ return (EINVAL);
+ } else
+ ngrp = td->td_ucred->cr_ngroups;
+ groups = malloc(ngrp * sizeof(*groups), M_TEMP, M_WAITOK);
+ error = kern_getgroups(td, &ngrp, groups);
+ if (error)
+ goto out;
+ if (uap->gidsetsize > 0)
+ error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
+ if (error == 0)
+ td->td_retval[0] = ngrp;
+out:
+ free(groups, M_TEMP);
+ return (error);
+}
+
+int
+kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
+{
+ struct ucred *cred;
+
+ cred = td->td_ucred;
+ if (*ngrp == 0) {
+ *ngrp = cred->cr_ngroups;
+ return (0);
+ }
+ if (*ngrp < cred->cr_ngroups)
+ return (EINVAL);
+ *ngrp = cred->cr_ngroups;
+ bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setsid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setsid(register struct thread *td, struct setsid_args *uap)
+{
+ struct pgrp *pgrp;
+ int error;
+ struct proc *p = td->td_proc;
+ struct pgrp *newpgrp;
+ struct session *newsess;
+
+ error = 0;
+ pgrp = NULL;
+
+ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+ newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+
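+	/*
+	 * Fail if the process is already a process group leader or if a
+	 * process group with its pid already exists; otherwise create a
+	 * new session and process group led by this process.
+	 */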
+ if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
+ if (pgrp != NULL)
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ } else {
+ (void)enterpgrp(p, p->p_pid, newpgrp, newsess);
+ td->td_retval[0] = p->p_pid;
+ newpgrp = NULL;
+ newsess = NULL;
+ }
+
+ sx_xunlock(&proctree_lock);
+
+ if (newpgrp != NULL)
+ free(newpgrp, M_PGRP);
+ if (newsess != NULL)
+ free(newsess, M_SESSION);
+
+ return (error);
+}
+
+/*
+ * set process group (setpgid/old setpgrp)
+ *
+ * caller does setpgid(targpid, targpgid)
+ *
+ * pid must be caller or child of caller (ESRCH)
+ * if a child
+ * pid must be in same session (EPERM)
+ * pid can't have done an exec (EACCES)
+ * if pgid != pid
+ * there must exist some pid in same session having pgid (EPERM)
+ * pid must not be session leader (EPERM)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setpgid_args {
+ int pid; /* target process id */
+ int pgid; /* target pgrp id */
+};
+#endif
+/* ARGSUSED */
+int
+sys_setpgid(struct thread *td, register struct setpgid_args *uap)
+{
+ struct proc *curp = td->td_proc;
+ register struct proc *targp; /* target process */
+ register struct pgrp *pgrp; /* target pgrp */
+ int error;
+ struct pgrp *newpgrp;
+
+ if (uap->pgid < 0)
+ return (EINVAL);
+
+ error = 0;
+
+ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+ if (uap->pid != 0 && uap->pid != curp->p_pid) {
+ if ((targp = pfind(uap->pid)) == NULL) {
+ error = ESRCH;
+ goto done;
+ }
+ if (!inferior(targp)) {
+ PROC_UNLOCK(targp);
+ error = ESRCH;
+ goto done;
+ }
+ if ((error = p_cansee(td, targp))) {
+ PROC_UNLOCK(targp);
+ goto done;
+ }
+ if (targp->p_pgrp == NULL ||
+ targp->p_session != curp->p_session) {
+ PROC_UNLOCK(targp);
+ error = EPERM;
+ goto done;
+ }
+ if (targp->p_flag & P_EXEC) {
+ PROC_UNLOCK(targp);
+ error = EACCES;
+ goto done;
+ }
+ PROC_UNLOCK(targp);
+ } else
+ targp = curp;
+ if (SESS_LEADER(targp)) {
+ error = EPERM;
+ goto done;
+ }
+ if (uap->pgid == 0)
+ uap->pgid = targp->p_pid;
+ if ((pgrp = pgfind(uap->pgid)) == NULL) {
+ if (uap->pgid == targp->p_pid) {
+ error = enterpgrp(targp, uap->pgid, newpgrp,
+ NULL);
+ if (error == 0)
+ newpgrp = NULL;
+ } else
+ error = EPERM;
+ } else {
+ if (pgrp == targp->p_pgrp) {
+ PGRP_UNLOCK(pgrp);
+ goto done;
+ }
+ if (pgrp->pg_id != targp->p_pid &&
+ pgrp->pg_session != curp->p_session) {
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ goto done;
+ }
+ PGRP_UNLOCK(pgrp);
+ error = enterthispgrp(targp, pgrp);
+ }
+done:
+ sx_xunlock(&proctree_lock);
+ KASSERT((error == 0) || (newpgrp != NULL),
+ ("setpgid failed and newpgrp is NULL"));
+ if (newpgrp != NULL)
+ free(newpgrp, M_PGRP);
+ return (error);
+}
+
+/*
+ * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
+ * compatible. It says that setting the uid/gid to euid/egid is a special
+ * case of "appropriate privilege". Once the rules are expanded out, this
+ * basically means that setuid(nnn) sets all three id's, in all permitted
+ * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
+ * does not set the saved id - this is dangerous for traditional BSD
+ * programs. For this reason, we *really* do not want to set
+ * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
+ */
+#define POSIX_APPENDIX_B_4_2_2
+
+#ifndef _SYS_SYSPROTO_H_
+struct setuid_args {
+ uid_t uid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setuid(struct thread *td, struct setuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t uid;
+ struct uidinfo *uip;
+ int error;
+
+ uid = uap->uid;
+ AUDIT_ARG_UID(uid);
+ newcred = crget();
+ uip = uifind(uid);
+ PROC_LOCK(p);
+ /*
+ * Copy credentials so other references do not see our changes.
+ */
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setuid(oldcred, uid);
+ if (error)
+ goto fail;
+#endif
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setuid(geteuid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setuid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * Notes on the logic. We do things in three steps.
+ * 1: We determine if the euid is going to change, and do EPERM
+ * right away. We unconditionally change the euid later if this
+ * test is satisfied, simplifying that part of the logic.
+ * 2: We determine if the real and/or saved uids are going to
+ * change. Determined by compile options.
+ * 3: Change euid last. (after tests in #2 for "appropriate privs")
+ */
+ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */
+#ifdef _POSIX_SAVED_IDS
+ uid != oldcred->cr_svuid && /* allow setuid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
+#endif
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
+ goto fail;
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or uid == euid)
+ * If so, we are changing the real uid and/or saved uid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
+ uid == oldcred->cr_uid ||
+#endif
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
+#endif
+ {
+ /*
+ * Set the real uid and transfer proc count to new user.
+ */
+ if (uid != oldcred->cr_ruid) {
+ change_ruid(newcred, uip);
+ setsugid(p);
+ }
+ /*
+ * Set saved uid
+ *
+ * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
+ * the security of seteuid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (uid != oldcred->cr_svuid) {
+ change_svuid(newcred, uid);
+ setsugid(p);
+ }
+ }
+
+ /*
+ * In all permitted cases, we are changing the euid.
+ */
+ if (uid != oldcred->cr_uid) {
+ change_euid(newcred, uip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(uip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(uip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct seteuid_args {
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_seteuid(struct thread *td, struct seteuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid;
+ struct uidinfo *euip;
+ int error;
+
+ euid = uap->euid;
+ AUDIT_ARG_EUID(euid);
+ newcred = crget();
+ euip = uifind(euid);
+ PROC_LOCK(p);
+ /*
+ * Copy credentials so other references do not see our changes.
+ */
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_seteuid(oldcred, euid);
+ if (error)
+ goto fail;
+#endif
+
+ if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
+ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
+ goto fail;
+
+ /*
+ * Everything's okay, do it.
+ */
+ if (oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgid_args {
+ gid_t gid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setgid(struct thread *td, struct setgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t gid;
+ int error;
+
+ gid = uap->gid;
+ AUDIT_ARG_GID(gid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setgid(oldcred, gid);
+ if (error)
+ goto fail;
+#endif
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setgid(getegid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setgid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * For notes on the logic here, see setuid() above.
+ */
+ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */
+#ifdef _POSIX_SAVED_IDS
+ gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+#endif
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
+ goto fail;
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or gid == egid)
+ * If so, we are changing the real uid and saved gid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
+ gid == oldcred->cr_groups[0] ||
+#endif
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
+#endif
+ {
+ /*
+ * Set real gid
+ */
+ if (oldcred->cr_rgid != gid) {
+ change_rgid(newcred, gid);
+ setsugid(p);
+ }
+ /*
+ * Set saved gid
+ *
+ * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
+ * the security of setegid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (oldcred->cr_svgid != gid) {
+ change_svgid(newcred, gid);
+ setsugid(p);
+ }
+ }
+ /*
+	 * In all permitted cases, we are changing the egid.
+ * Copy credentials so other references do not see our changes.
+ */
+ if (oldcred->cr_groups[0] != gid) {
+ change_egid(newcred, gid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setegid_args {
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setegid(struct thread *td, struct setegid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid;
+ int error;
+
+ egid = uap->egid;
+ AUDIT_ARG_EGID(egid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setegid(oldcred, egid);
+ if (error)
+ goto fail;
+#endif
+
+ if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
+ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
+ goto fail;
+
+ if (oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setgroups(struct thread *td, struct setgroups_args *uap)
+{
+ gid_t *groups = NULL;
+ int error;
+
+ if (uap->gidsetsize > ngroups_max + 1)
+ return (EINVAL);
+ groups = malloc(uap->gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
+ error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
+ if (error)
+ goto out;
+ error = kern_setgroups(td, uap->gidsetsize, groups);
+out:
+ free(groups, M_TEMP);
+ return (error);
+}
+
+int
+kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ if (ngrp > ngroups_max + 1)
+ return (EINVAL);
+ AUDIT_ARG_GROUPSET(groups, ngrp);
+ newcred = crget();
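+	/* Make sure the new credential's group array can hold ngrp entries. */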
+ crextend(newcred, ngrp);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setgroups(oldcred, ngrp, groups);
+ if (error)
+ goto fail;
+#endif
+
+ error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
+ if (error)
+ goto fail;
+
+ if (ngrp < 1) {
+ /*
+ * setgroups(0, NULL) is a legitimate way of clearing the
+ * groups vector on non-BSD systems (which generally do not
+ * have the egid in the groups[0]). We risk security holes
+ * when running non-BSD software if we do not do the same.
+ */
+ newcred->cr_ngroups = 1;
+ } else {
+ crsetgroups_locked(newcred, ngrp, groups);
+ }
+ setsugid(p);
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setreuid_args {
+ uid_t ruid;
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setreuid(register struct thread *td, struct setreuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ AUDIT_ARG_EUID(euid);
+ AUDIT_ARG_RUID(ruid);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setreuid(oldcred, ruid, euid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
+ euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
+ goto fail;
+
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
+ newcred->cr_svuid != newcred->cr_uid) {
+ change_svuid(newcred, newcred->cr_uid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setregid_args {
+ gid_t rgid;
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setregid(register struct thread *td, struct setregid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ AUDIT_ARG_EGID(egid);
+ AUDIT_ARG_RGID(rgid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setregid(oldcred, rgid, egid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+ egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
+ goto fail;
+
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
+ newcred->cr_svgid != newcred->cr_groups[0]) {
+ change_svgid(newcred, newcred->cr_groups[0]);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+/*
+ * setresuid(ruid, euid, suid) is like setreuid except control over the saved
+ * uid is explicit.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setresuid_args {
+ uid_t ruid;
+ uid_t euid;
+ uid_t suid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setresuid(register struct thread *td, struct setresuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid, suid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ suid = uap->suid;
+ AUDIT_ARG_EUID(euid);
+ AUDIT_ARG_RUID(ruid);
+ AUDIT_ARG_SUID(suid);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setresuid(oldcred, ruid, euid, suid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid &&
+ ruid != oldcred->cr_uid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
+ euid != oldcred->cr_svuid &&
+ euid != oldcred->cr_uid) ||
+ (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
+ suid != oldcred->cr_svuid &&
+ suid != oldcred->cr_uid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
+ goto fail;
+
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
+ change_svuid(newcred, suid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+/*
+ * setresgid(rgid, egid, sgid) is like setregid except control over the saved
+ * gid is explicit.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setresgid_args {
+ gid_t rgid;
+ gid_t egid;
+ gid_t sgid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setresgid(register struct thread *td, struct setresgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid, sgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ sgid = uap->sgid;
+ AUDIT_ARG_EGID(egid);
+ AUDIT_ARG_RGID(rgid);
+ AUDIT_ARG_SGID(sgid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid &&
+ rgid != oldcred->cr_groups[0]) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
+ egid != oldcred->cr_svgid &&
+ egid != oldcred->cr_groups[0]) ||
+ (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
+ sgid != oldcred->cr_svgid &&
+ sgid != oldcred->cr_groups[0])) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
+ goto fail;
+
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
+ change_svgid(newcred, sgid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresuid_args {
+ uid_t *ruid;
+ uid_t *euid;
+ uid_t *suid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getresuid(register struct thread *td, struct getresuid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->ruid)
+ error1 = copyout(&cred->cr_ruid,
+ uap->ruid, sizeof(cred->cr_ruid));
+ if (uap->euid)
+ error2 = copyout(&cred->cr_uid,
+ uap->euid, sizeof(cred->cr_uid));
+ if (uap->suid)
+ error3 = copyout(&cred->cr_svuid,
+ uap->suid, sizeof(cred->cr_svuid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresgid_args {
+ gid_t *rgid;
+ gid_t *egid;
+ gid_t *sgid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getresgid(register struct thread *td, struct getresgid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->rgid)
+ error1 = copyout(&cred->cr_rgid,
+ uap->rgid, sizeof(cred->cr_rgid));
+ if (uap->egid)
+ error2 = copyout(&cred->cr_groups[0],
+ uap->egid, sizeof(cred->cr_groups[0]));
+ if (uap->sgid)
+ error3 = copyout(&cred->cr_svgid,
+ uap->sgid, sizeof(cred->cr_svgid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct issetugid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_issetugid(register struct thread *td, struct issetugid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ /*
+	 * Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time; we use
+	 * P_SUGID because we consider a change of owners to be "tainting"
+	 * as well.
+ * This is significant for procs that start as root and "become"
+ * a user without an exec - programs cannot know *everything*
+ * that libc *might* have put in their data segment.
+ */
+ PROC_LOCK(p);
+ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+int
+sys___setugid(struct thread *td, struct __setugid_args *uap)
+{
+#ifdef REGRESSION
+ struct proc *p;
+
+ p = td->td_proc;
+ switch (uap->flag) {
+ case 0:
+ PROC_LOCK(p);
+ p->p_flag &= ~P_SUGID;
+ PROC_UNLOCK(p);
+ return (0);
+ case 1:
+ PROC_LOCK(p);
+ p->p_flag |= P_SUGID;
+ PROC_UNLOCK(p);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+#else /* !REGRESSION */
+
+ return (ENOSYS);
+#endif /* REGRESSION */
+}
+
+/*
+ * Check if gid is a member of the group set.
+ */
+int
+groupmember(gid_t gid, struct ucred *cred)
+{
+ int l;
+ int h;
+ int m;
+
+ if (cred->cr_groups[0] == gid)
+ return(1);
+
+ /*
+ * If gid was not our primary group, perform a binary search
+ * of the supplemental groups. This is possible because we
+ * sort the groups in crsetgroups().
+ */
+ l = 1;
+ h = cred->cr_ngroups;
+ while (l < h) {
+ m = l + ((h - l) / 2);
+ if (cred->cr_groups[m] < gid)
+ l = m + 1;
+ else
+ h = m;
+ }
+ if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid))
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Test the active securelevel against a given level. securelevel_gt()
+ * implements (securelevel > level). securelevel_ge() implements
+ * (securelevel >= level). Note that the logic is inverted -- these
+ * functions return EPERM on "success" and 0 on "failure".
+ *
+ * Due to care taken when setting the securelevel, we know that no jail will
+ * be less secure than its parent (or the physical system), so it is sufficient
+ * to test the current jail only.
+ *
+ * XXXRW: Possibly since this has to do with privilege, it should move to
+ * kern_priv.c.
+ */
+int
+securelevel_gt(struct ucred *cr, int level)
+{
+
+ return (cr->cr_prison->pr_securelevel > level ? EPERM : 0);
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+
+ return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0);
+}
+
+/*
+ * 'see_other_uids' determines whether or not visibility of processes
+ * and sockets with credentials holding different real uids is possible
+ * using a variety of system MIBs.
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int see_other_uids = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
+ &see_other_uids, 0,
+ "Unprivileged processes may see subjects/objects with different real uid");
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2, according to the
+ * 'see_other_uids' policy.
+ * Returns: 0 for permitted, ESRCH otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+static int
+cr_seeotheruids(struct ucred *u1, struct ucred *u2)
+{
+
+ if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
+ if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
+ return (ESRCH);
+ }
+ return (0);
+}
+
+/*
+ * 'see_other_gids' determines whether or not visibility of processes
+ * and sockets with credentials holding different real gids is possible
+ * using a variety of system MIBs.
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int see_other_gids = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
+ &see_other_gids, 0,
+ "Unprivileged processes may see subjects/objects with different real gid");
+
+/*
+ * Determine if u1 can "see" the subject specified by u2, according to the
+ * 'see_other_gids' policy.
+ * Returns: 0 for permitted, ESRCH otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+static int
+cr_seeothergids(struct ucred *u1, struct ucred *u2)
+{
+ int i, match;
+
+ if (!see_other_gids) {
+ match = 0;
+ for (i = 0; i < u1->cr_ngroups; i++) {
+ if (groupmember(u1->cr_groups[i], u2))
+ match = 1;
+ if (match)
+ break;
+ }
+ if (!match) {
+ if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
+ return (ESRCH);
+ }
+ }
+ return (0);
+}
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+int
+cr_cansee(struct ucred *u1, struct ucred *u2)
+{
+ int error;
+
+ if ((error = prison_check(u1, u2)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_cred_check_visible(u1, u2)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(u1, u2)))
+ return (error);
+ if ((error = cr_seeothergids(u1, u2)))
+ return (error);
+ return (0);
+}
+
+/*-
+ * Determine if td "can see" the subject specified by p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect p->p_ucred must be held. td really
+ * should be curthread.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansee(struct thread *td, struct proc *p)
+{
+
+ /* Wrap cr_cansee() for all functionality. */
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ return (cr_cansee(td->td_ucred, p->p_ucred));
+}
+
+/*
+ * 'conservative_signals' prevents the delivery of a broad class of
+ * signals by unprivileged processes to processes that have changed their
+ * credentials since the last invocation of execve(). This can prevent
+ * the leakage of cached information or retained privileges as a result
+ * of a common class of signal-related vulnerabilities. However, this
+ * may interfere with some applications that expect to be able to
+ * deliver these signals to peer processes after having given up
+ * privilege.
+ */
+static int conservative_signals = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
+ &conservative_signals, 0, "Unprivileged processes prevented from "
+ "sending certain signals to processes whose credentials have changed");
+/*-
+ * Determine whether cred may deliver the specified signal to proc.
+ * Returns: 0 for permitted, an errno value otherwise.
+ * Locks: A lock must be held for proc.
+ * References: cred and proc must be valid for the lifetime of the call.
+ */
+int
+cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
+{
+ int error;
+
+ PROC_LOCK_ASSERT(proc, MA_OWNED);
+ /*
+ * Jail semantics limit the scope of signalling to proc in the
+ * same jail as cred, if cred is in jail.
+ */
+ error = prison_check(cred, proc->p_ucred);
+ if (error)
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_signal(cred, proc, signum)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(cred, proc->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(cred, proc->p_ucred)))
+ return (error);
+
+ /*
+ * UNIX signal semantics depend on the status of the P_SUGID
+ * bit on the target process. If the bit is set, then additional
+ * restrictions are placed on the set of available signals.
+ */
+ if (conservative_signals && (proc->p_flag & P_SUGID)) {
+ switch (signum) {
+ case 0:
+ case SIGKILL:
+ case SIGINT:
+ case SIGTERM:
+ case SIGALRM:
+ case SIGSTOP:
+ case SIGTTIN:
+ case SIGTTOU:
+ case SIGTSTP:
+ case SIGHUP:
+ case SIGUSR1:
+ case SIGUSR2:
+ /*
+ * Generally, permit job and terminal control
+ * signals.
+ */
+ break;
+ default:
+ /* Not permitted without privilege. */
+ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
+ if (error)
+ return (error);
+ }
+ }
+
+ /*
+ * Generally, the target credential's ruid or svuid must match the
+ * subject credential's ruid or euid.
+ */
+ if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
+ cred->cr_ruid != proc->p_ucred->cr_svuid &&
+ cred->cr_uid != proc->p_ucred->cr_ruid &&
+ cred->cr_uid != proc->p_ucred->cr_svuid) {
+ error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+/*-
+ * Determine whether td may deliver the specified signal to p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must be
+ * held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansignal(struct thread *td, struct proc *p, int signum)
+{
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+
+ /*
+ * UNIX signalling semantics require that processes in the same
+ * session always be able to deliver SIGCONT to one another,
+ * overriding the remaining protections.
+ */
+ /* XXX: This will require an additional lock of some sort. */
+ if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
+ return (0);
+ /*
+ * Some compat layers use SIGTHR and higher signals for
+ * communication between different kernel threads of the same
+	 * process, and expect that it is always possible to deliver
+	 * them, even for suid applications where cr_cansignal() can deny
+	 * such ability for security reasons.  It should be pretty safe
+	 * to do so, since the only way to create two processes
+ * with the same p_leader is via rfork(2).
+ */
+ if (td->td_proc->p_leader != NULL && signum >= SIGTHR &&
+ signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader)
+ return (0);
+
+ return (cr_cansignal(td->td_ucred, p, signum));
+}
+
+/*-
+ * Determine whether td may reschedule p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansched(struct thread *td, struct proc *p)
+{
+ int error;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_sched(td->td_ucred, p)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
+ td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
+ error = priv_check(td, PRIV_SCHED_DIFFCRED);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * The 'unprivileged_proc_debug' flag may be used to disable a variety of
+ * unprivileged inter-process debugging services, including some procfs
+ * functionality, ptrace(), and ktrace(). In the past, inter-process
+ * debugging has been involved in a variety of security problems, and sites
+ * not requiring the service might choose to disable it when hardening
+ * systems.
+ *
+ * XXX: Should modifying and reading this variable require locking?
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int unprivileged_proc_debug = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
+ &unprivileged_proc_debug, 0,
+ "Unprivileged processes may use process debugging facilities");
+
+/*-
+ * Determine whether td may debug p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_candebug(struct thread *td, struct proc *p)
+{
+ int credentialchanged, error, grpsubset, i, uidsubset;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (!unprivileged_proc_debug) {
+ error = priv_check(td, PRIV_DEBUG_UNPRIV);
+ if (error)
+ return (error);
+ }
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_debug(td->td_ucred, p)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
+ return (error);
+
+ /*
+ * Is p's group set a subset of td's effective group set? This
+ * includes p's egid, group access list, rgid, and svgid.
+ */
+ grpsubset = 1;
+ for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
+ if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
+ grpsubset = 0;
+ break;
+ }
+ }
+ grpsubset = grpsubset &&
+ groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
+ groupmember(p->p_ucred->cr_svgid, td->td_ucred);
+
+ /*
+ * Are the uids present in p's credential equal to td's
+ * effective uid? This includes p's euid, svuid, and ruid.
+ */
+ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
+
+ /*
+ * Has the credential of the process changed since the last exec()?
+ */
+ credentialchanged = (p->p_flag & P_SUGID);
+
+ /*
+ * If p's gids aren't a subset, or the uids aren't a subset,
+ * or the credential has changed, require appropriate privilege
+ * for td to debug p.
+ */
+ if (!grpsubset || !uidsubset) {
+ error = priv_check(td, PRIV_DEBUG_DIFFCRED);
+ if (error)
+ return (error);
+ }
+
+ if (credentialchanged) {
+ error = priv_check(td, PRIV_DEBUG_SUGID);
+ if (error)
+ return (error);
+ }
+
+ /* Can't trace init when securelevel > 0. */
+ if (p == initproc) {
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Can't trace a process that's currently exec'ing.
+ *
+ * XXX: Note, this is not a security policy decision, it's a
+ * basic correctness/functionality decision. Therefore, this check
+ * should be moved to the callers of p_candebug().
+ */
+ if ((p->p_flag & P_INEXEC) != 0)
+ return (EBUSY);
+
+ return (0);
+}
+
+/*-
+ * Determine whether the subject represented by cred can "see" a socket.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ */
+int
+cr_canseesocket(struct ucred *cred, struct socket *so)
+{
+ int error;
+
+ error = prison_check(cred, so->so_cred);
+ if (error)
+ return (ENOENT);
+#ifdef MAC
+ error = mac_socket_check_visible(cred, so);
+ if (error)
+ return (error);
+#endif
+ if (cr_seeotheruids(cred, so->so_cred))
+ return (ENOENT);
+ if (cr_seeothergids(cred, so->so_cred))
+ return (ENOENT);
+
+ return (0);
+}
+
+#if defined(INET) || defined(INET6)
+/*-
+ * Determine whether the subject represented by cred can "see" an inpcb.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ */
+int
+cr_canseeinpcb(struct ucred *cred, struct inpcb *inp)
+{
+ int error;
+
+ error = prison_check(cred, inp->inp_cred);
+ if (error)
+ return (ENOENT);
+#ifdef MAC
+ INP_LOCK_ASSERT(inp);
+ error = mac_inpcb_check_visible(cred, inp);
+ if (error)
+ return (error);
+#endif
+ if (cr_seeotheruids(cred, inp->inp_cred))
+ return (ENOENT);
+ if (cr_seeothergids(cred, inp->inp_cred))
+ return (ENOENT);
+
+ return (0);
+}
+#endif
+
+/*-
+ * Determine whether td can wait for the exit of p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_canwait(struct thread *td, struct proc *p)
+{
+ int error;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_wait(td->td_ucred, p)))
+ return (error);
+#endif
+#if 0
+ /* XXXMAC: This could have odd effects on some shells. */
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+#endif
+
+ return (0);
+}
+
+/*
+ * Allocate a zeroed cred structure.
+ */
+struct ucred *
+crget(void)
+{
+ struct ucred *cr;
+
+ cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
+ refcount_init(&cr->cr_ref, 1);
+#ifdef AUDIT
+ audit_cred_init(cr);
+#endif
+#ifdef MAC
+ mac_cred_init(cr);
+#endif
+ crextend(cr, XU_NGROUPS);
+ return (cr);
+}
+
+/*
+ * Claim another reference to a ucred structure.
+ */
+struct ucred *
+crhold(struct ucred *cr)
+{
+
+ refcount_acquire(&cr->cr_ref);
+ return (cr);
+}
+
+/*
+ * Free a cred structure. The space is released once the reference count reaches 0.
+ */
+void
+crfree(struct ucred *cr)
+{
+
+ KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
+ KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+ if (refcount_release(&cr->cr_ref)) {
+ /*
+ * Some callers of crget(), such as nfs_statfs(),
+ * allocate a temporary credential, but don't
+ * allocate a uidinfo structure.
+ */
+ if (cr->cr_uidinfo != NULL)
+ uifree(cr->cr_uidinfo);
+ if (cr->cr_ruidinfo != NULL)
+ uifree(cr->cr_ruidinfo);
+ /*
+ * Free a prison, if any.
+ */
+ if (cr->cr_prison != NULL)
+ prison_free(cr->cr_prison);
+ if (cr->cr_loginclass != NULL)
+ loginclass_free(cr->cr_loginclass);
+#ifdef AUDIT
+ audit_cred_destroy(cr);
+#endif
+#ifdef MAC
+ mac_cred_destroy(cr);
+#endif
+ free(cr->cr_groups, M_CRED);
+ free(cr, M_CRED);
+ }
+}
+
+/*
+ * Check to see if this ucred is shared.
+ */
+int
+crshared(struct ucred *cr)
+{
+
+ return (cr->cr_ref > 1);
+}
+
+/*
+ * Copy a ucred's contents from a template. Does not block.
+ */
+void
+crcopy(struct ucred *dest, struct ucred *src)
+{
+
+ KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
+ bcopy(&src->cr_startcopy, &dest->cr_startcopy,
+ (unsigned)((caddr_t)&src->cr_endcopy -
+ (caddr_t)&src->cr_startcopy));
+ crsetgroups(dest, src->cr_ngroups, src->cr_groups);
+ uihold(dest->cr_uidinfo);
+ uihold(dest->cr_ruidinfo);
+ prison_hold(dest->cr_prison);
+ loginclass_hold(dest->cr_loginclass);
+#ifdef AUDIT
+ audit_cred_copy(src, dest);
+#endif
+#ifdef MAC
+ mac_cred_copy(src, dest);
+#endif
+}
+
+/*
+ * Dup cred struct to a new held one.
+ */
+struct ucred *
+crdup(struct ucred *cr)
+{
+ struct ucred *newcr;
+
+ newcr = crget();
+ crcopy(newcr, cr);
+ return (newcr);
+}
+
+/*
+ * Fill in a struct xucred based on a struct ucred.
+ */
+void
+cru2x(struct ucred *cr, struct xucred *xcr)
+{
+ int ngroups;
+
+ bzero(xcr, sizeof(*xcr));
+ xcr->cr_version = XUCRED_VERSION;
+ xcr->cr_uid = cr->cr_uid;
+
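+ /*
+ * Only the first XU_NGROUPS groups fit in a struct xucred;
+ * any additional supplemental groups are not copied.
+ */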
+ ngroups = MIN(cr->cr_ngroups, XU_NGROUPS);
+ xcr->cr_ngroups = ngroups;
+ bcopy(cr->cr_groups, xcr->cr_groups,
+ ngroups * sizeof(*cr->cr_groups));
+}
+
+/*
+ * Small routine to swap a thread's current ucred for the correct one taken
+ * from the process.
+ */
+void
+cred_update_thread(struct thread *td)
+{
+ struct proc *p;
+ struct ucred *cred;
+
+ p = td->td_proc;
+ cred = td->td_ucred;
+ PROC_LOCK(p);
+ td->td_ucred = crhold(p->p_ucred);
+ PROC_UNLOCK(p);
+ if (cred != NULL)
+ crfree(cred);
+}
+
+struct ucred *
+crcopysafe(struct proc *p, struct ucred *cr)
+{
+ struct ucred *oldcred;
+ int groups;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ oldcred = p->p_ucred;
+ while (cr->cr_agroups < oldcred->cr_agroups) {
+ groups = oldcred->cr_agroups;
+ PROC_UNLOCK(p);
+ crextend(cr, groups);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ }
+ crcopy(cr, oldcred);
+
+ return (oldcred);
+}
+
+/*
+ * Extend the passed-in credential to hold n groups.
+ */
+static void
+crextend(struct ucred *cr, int n)
+{
+ int cnt;
+
+ /* Nothing to do if there is already enough space. */
+ if (n <= cr->cr_agroups)
+ return;
+
+ /*
+ * We double the size each time, since we're using a power-of-two
+ * allocator, until we need enough groups to fill a page.
+ * Once we're allocating multiple pages, only allocate as many
+ * as we actually need. The case of processes needing a
+ * non-power-of-two number of pages seems more likely than
+ * a real-world process that adds thousands of groups one at a
+ * time.
+ */
+ if (n < PAGE_SIZE / sizeof(gid_t)) {
+ if (cr->cr_agroups == 0)
+ cnt = MINALLOCSIZE / sizeof(gid_t);
+ else
+ cnt = cr->cr_agroups * 2;
+
+ while (cnt < n)
+ cnt *= 2;
+ } else
+ cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t));
+
+ /* Free the old array. */
+ if (cr->cr_groups)
+ free(cr->cr_groups, M_CRED);
+
+ cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO);
+ cr->cr_agroups = cnt;
+}
+
+/*
+ * Copy groups into a credential, preserving any necessary invariants.
+ * Currently this includes the sorting of all supplemental gids.
+ * crextend() must have been called beforehand to ensure sufficient
+ * space is available.
+ */
+static void
+crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups)
+{
+ int i;
+ int j;
+ gid_t g;
+
+ KASSERT(cr->cr_agroups >= ngrp, ("cr_agroups is too small"));
+
+ bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t));
+ cr->cr_ngroups = ngrp;
+
+ /*
+ * Sort all groups except cr_groups[0] to allow groupmember to
+ * perform a binary search.
+ *
+ * XXX: If large numbers of groups become common, this should be
+ * replaced with shell sort (as Linux uses) or possibly heap sort.
+ */
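+ /*
+ * For example, groups {5, 20, 7, 3} with egid 5 in cr_groups[0]
+ * end up as {5, 3, 7, 20}.
+ */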
+ for (i = 2; i < ngrp; i++) {
+ g = cr->cr_groups[i];
+ for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--)
+ cr->cr_groups[j + 1] = cr->cr_groups[j];
+ cr->cr_groups[j + 1] = g;
+ }
+}
+
+/*
+ * Copy groups into a credential after expanding it if required.
+ * Truncate the list to (ngroups_max + 1) if it is too large.
+ */
+void
+crsetgroups(struct ucred *cr, int ngrp, gid_t *groups)
+{
+
+ if (ngrp > ngroups_max + 1)
+ ngrp = ngroups_max + 1;
+
+ crextend(cr, ngrp);
+ crsetgroups_locked(cr, ngrp, groups);
+}
+
+/*
+ * Get login name, if available.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getlogin_args {
+ char *namebuf;
+ u_int namelen;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getlogin(struct thread *td, struct getlogin_args *uap)
+{
+ int error;
+ char login[MAXLOGNAME];
+ struct proc *p = td->td_proc;
+
+ if (uap->namelen > MAXLOGNAME)
+ uap->namelen = MAXLOGNAME;
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ bcopy(p->p_session->s_login, login, uap->namelen);
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ if (strlen(login) + 1 > uap->namelen)
+ return (ERANGE);
+ error = copyout(login, uap->namebuf, uap->namelen);
+ return (error);
+}
+
+/*
+ * Set login name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setlogin_args {
+ char *namebuf;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setlogin(struct thread *td, struct setlogin_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int error;
+ char logintmp[MAXLOGNAME];
+
+ error = priv_check(td, PRIV_PROC_SETLOGIN);
+ if (error)
+ return (error);
+ error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+ else if (!error) {
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ (void) memcpy(p->p_session->s_login, logintmp,
+ sizeof(logintmp));
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ }
+ return (error);
+}
+
+void
+setsugid(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_flag |= P_SUGID;
+ if (!(p->p_pfsflags & PF_ISUGID))
+ p->p_stops = 0;
+}
+
+/*-
+ * Change a process's effective uid.
+ * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_euid(struct ucred *newcred, struct uidinfo *euip)
+{
+
+ newcred->cr_uid = euip->ui_uid;
+ uihold(euip);
+ uifree(newcred->cr_uidinfo);
+ newcred->cr_uidinfo = euip;
+}
+
+/*-
+ * Change a process's effective gid.
+ * Side effects: newcred->cr_gid will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_egid(struct ucred *newcred, gid_t egid)
+{
+
+ newcred->cr_groups[0] = egid;
+}
+
+/*-
+ * Change a process's real uid.
+ * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
+ * will be updated, and the old and new cr_ruidinfo proc
+ * counts will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_ruid(struct ucred *newcred, struct uidinfo *ruip)
+{
+
+ (void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
+ newcred->cr_ruid = ruip->ui_uid;
+ uihold(ruip);
+ uifree(newcred->cr_ruidinfo);
+ newcred->cr_ruidinfo = ruip;
+ (void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
+}
+
+/*-
+ * Change a process's real gid.
+ * Side effects: newcred->cr_rgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_rgid(struct ucred *newcred, gid_t rgid)
+{
+
+ newcred->cr_rgid = rgid;
+}
+
+/*-
+ * Change a process's saved uid.
+ * Side effects: newcred->cr_svuid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svuid(struct ucred *newcred, uid_t svuid)
+{
+
+ newcred->cr_svuid = svuid;
+}
+
+/*-
+ * Change a process's saved gid.
+ * Side effects: newcred->cr_svgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svgid(struct ucred *newcred, gid_t svgid)
+{
+
+ newcred->cr_svgid = svgid;
+}
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
new file mode 100644
index 0000000..d31c832
--- /dev/null
+++ b/sys/kern/kern_racct.c
@@ -0,0 +1,1291 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/umtx.h>
+#include <machine/smp.h>
+
+#ifdef RCTL
+#include <sys/rctl.h>
+#endif
+
+#ifdef RACCT
+
+FEATURE(racct, "Resource Accounting");
+
+/*
+ * Do not block processes that have their %cpu usage <= pcpu_threshold.
+ */
+static int pcpu_threshold = 1;
+
+SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
+SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
+ 0, "Processes with higher %cpu usage than this value can be throttled.");
+
+/*
+ * How many seconds must pass before we switch to the scheduler's %cpu
+ * calculations. When a process starts, we compute its %cpu usage by
+ * dividing its runtime by the process wall clock time. After
+ * RACCT_PCPU_SECS have passed, we use the value provided by the scheduler.
+ */
+#define RACCT_PCPU_SECS 3
+
+static struct mtx racct_lock;
+MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
+
+static uma_zone_t racct_zone;
+
+static void racct_sub_racct(struct racct *dest, const struct racct *src);
+static void racct_sub_cred_locked(struct ucred *cred, int resource,
+ uint64_t amount);
+static void racct_add_cred_locked(struct ucred *cred, int resource,
+ uint64_t amount);
+
+SDT_PROVIDER_DEFINE(racct);
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure,
+ "struct proc *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure,
+ "struct proc *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *");
+SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *",
+ "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure,
+ "struct racct *", "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *",
+ "struct racct *");
+
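+/*
+ * A rough summary of the flags used in the table below, as they are used
+ * in this file: RACCT_RECLAIMABLE usage is released when no longer
+ * consumed and must be zero when a racct is destroyed; RACCT_INHERITABLE
+ * usage is copied from parent to child on fork; RACCT_DENIABLE
+ * allocations may be refused (e.g. by RCTL rules); RACCT_SLOPPY
+ * accounting is allowed to be imprecise; RACCT_DECAYING values decay
+ * over time in the per-credential containers; RACCT_IN_MILLIONS values
+ * are stored scaled by 1000000.
+ */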
+int racct_types[] = {
+ [RACCT_CPU] =
+ RACCT_IN_MILLIONS,
+ [RACCT_DATA] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_STACK] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_CORE] =
+ RACCT_DENIABLE,
+ [RACCT_RSS] =
+ RACCT_RECLAIMABLE,
+ [RACCT_MEMLOCK] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_NPROC] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_NOFILE] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_VMEM] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_NPTS] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_SWAP] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NTHR] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_MSGQQUEUED] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_MSGQSIZE] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NMSGQ] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NSEM] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NSEMOP] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_NSHM] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_SHMSIZE] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_WALLCLOCK] =
+ RACCT_IN_MILLIONS,
+ [RACCT_PCTCPU] =
+ RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+
+static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
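+/*
+ * Multiplying by RACCT_DECAY_FACTOR and dividing by FSCALE keeps roughly
+ * 30% of the previous value (the factor is stored as FSCALE-scaled fixed
+ * point); racct_decay() below applies this once per racctd pass, roughly
+ * once a second.
+ */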
+
+#ifdef SCHED_4BSD
+/*
+ * Contains intermediate values for %cpu calculations to avoid using floating
+ * point in the kernel.
+ * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
+ * It is needed only for the 4BSD scheduler, because in ULE ccpu is zero,
+ * so the calculations are more straightforward.
+ */
+fixpt_t ccpu_exp[] = {
+ [0] = FSCALE * 1,
+ [1] = FSCALE * 0.95122942450071400909,
+ [2] = FSCALE * 0.90483741803595957316,
+ [3] = FSCALE * 0.86070797642505780722,
+ [4] = FSCALE * 0.81873075307798185866,
+ [5] = FSCALE * 0.77880078307140486824,
+ [6] = FSCALE * 0.74081822068171786606,
+ [7] = FSCALE * 0.70468808971871343435,
+ [8] = FSCALE * 0.67032004603563930074,
+ [9] = FSCALE * 0.63762815162177329314,
+ [10] = FSCALE * 0.60653065971263342360,
+ [11] = FSCALE * 0.57694981038048669531,
+ [12] = FSCALE * 0.54881163609402643262,
+ [13] = FSCALE * 0.52204577676101604789,
+ [14] = FSCALE * 0.49658530379140951470,
+ [15] = FSCALE * 0.47236655274101470713,
+ [16] = FSCALE * 0.44932896411722159143,
+ [17] = FSCALE * 0.42741493194872666992,
+ [18] = FSCALE * 0.40656965974059911188,
+ [19] = FSCALE * 0.38674102345450120691,
+ [20] = FSCALE * 0.36787944117144232159,
+ [21] = FSCALE * 0.34993774911115535467,
+ [22] = FSCALE * 0.33287108369807955328,
+ [23] = FSCALE * 0.31663676937905321821,
+ [24] = FSCALE * 0.30119421191220209664,
+ [25] = FSCALE * 0.28650479686019010032,
+ [26] = FSCALE * 0.27253179303401260312,
+ [27] = FSCALE * 0.25924026064589150757,
+ [28] = FSCALE * 0.24659696394160647693,
+ [29] = FSCALE * 0.23457028809379765313,
+ [30] = FSCALE * 0.22313016014842982893,
+ [31] = FSCALE * 0.21224797382674305771,
+ [32] = FSCALE * 0.20189651799465540848,
+ [33] = FSCALE * 0.19204990862075411423,
+ [34] = FSCALE * 0.18268352405273465022,
+ [35] = FSCALE * 0.17377394345044512668,
+ [36] = FSCALE * 0.16529888822158653829,
+ [37] = FSCALE * 0.15723716631362761621,
+ [38] = FSCALE * 0.14956861922263505264,
+ [39] = FSCALE * 0.14227407158651357185,
+ [40] = FSCALE * 0.13533528323661269189,
+ [41] = FSCALE * 0.12873490358780421886,
+ [42] = FSCALE * 0.12245642825298191021,
+ [43] = FSCALE * 0.11648415777349695786,
+ [44] = FSCALE * 0.11080315836233388333,
+ [45] = FSCALE * 0.10539922456186433678,
+ [46] = FSCALE * 0.10025884372280373372,
+ [47] = FSCALE * 0.09536916221554961888,
+ [48] = FSCALE * 0.09071795328941250337,
+ [49] = FSCALE * 0.08629358649937051097,
+ [50] = FSCALE * 0.08208499862389879516,
+ [51] = FSCALE * 0.07808166600115315231,
+ [52] = FSCALE * 0.07427357821433388042,
+ [53] = FSCALE * 0.07065121306042958674,
+ [54] = FSCALE * 0.06720551273974976512,
+ [55] = FSCALE * 0.06392786120670757270,
+ [56] = FSCALE * 0.06081006262521796499,
+ [57] = FSCALE * 0.05784432087483846296,
+ [58] = FSCALE * 0.05502322005640722902,
+ [59] = FSCALE * 0.05233970594843239308,
+ [60] = FSCALE * 0.04978706836786394297,
+ [61] = FSCALE * 0.04735892439114092119,
+ [62] = FSCALE * 0.04504920239355780606,
+ [63] = FSCALE * 0.04285212686704017991,
+ [64] = FSCALE * 0.04076220397836621516,
+ [65] = FSCALE * 0.03877420783172200988,
+ [66] = FSCALE * 0.03688316740124000544,
+ [67] = FSCALE * 0.03508435410084502588,
+ [68] = FSCALE * 0.03337326996032607948,
+ [69] = FSCALE * 0.03174563637806794323,
+ [70] = FSCALE * 0.03019738342231850073,
+ [71] = FSCALE * 0.02872463965423942912,
+ [72] = FSCALE * 0.02732372244729256080,
+ [73] = FSCALE * 0.02599112877875534358,
+ [74] = FSCALE * 0.02472352647033939120,
+ [75] = FSCALE * 0.02351774585600910823,
+ [76] = FSCALE * 0.02237077185616559577,
+ [77] = FSCALE * 0.02127973643837716938,
+ [78] = FSCALE * 0.02024191144580438847,
+ [79] = FSCALE * 0.01925470177538692429,
+ [80] = FSCALE * 0.01831563888873418029,
+ [81] = FSCALE * 0.01742237463949351138,
+ [82] = FSCALE * 0.01657267540176124754,
+ [83] = FSCALE * 0.01576441648485449082,
+ [84] = FSCALE * 0.01499557682047770621,
+ [85] = FSCALE * 0.01426423390899925527,
+ [86] = FSCALE * 0.01356855901220093175,
+ [87] = FSCALE * 0.01290681258047986886,
+ [88] = FSCALE * 0.01227733990306844117,
+ [89] = FSCALE * 0.01167856697039544521,
+ [90] = FSCALE * 0.01110899653824230649,
+ [91] = FSCALE * 0.01056720438385265337,
+ [92] = FSCALE * 0.01005183574463358164,
+ [93] = FSCALE * 0.00956160193054350793,
+ [94] = FSCALE * 0.00909527710169581709,
+ [95] = FSCALE * 0.00865169520312063417,
+ [96] = FSCALE * 0.00822974704902002884,
+ [97] = FSCALE * 0.00782837754922577143,
+ [98] = FSCALE * 0.00744658307092434051,
+ [99] = FSCALE * 0.00708340892905212004,
+ [100] = FSCALE * 0.00673794699908546709,
+ [101] = FSCALE * 0.00640933344625638184,
+ [102] = FSCALE * 0.00609674656551563610,
+ [103] = FSCALE * 0.00579940472684214321,
+ [104] = FSCALE * 0.00551656442076077241,
+ [105] = FSCALE * 0.00524751839918138427,
+ [106] = FSCALE * 0.00499159390691021621,
+ [107] = FSCALE * 0.00474815099941147558,
+ [108] = FSCALE * 0.00451658094261266798,
+ [109] = FSCALE * 0.00429630469075234057,
+ [110] = FSCALE * 0.00408677143846406699,
+};
+#endif
+
+#define CCPU_EXP_MAX 110
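+/*
+ * As a sanity check on the table above: ccpu_exp[20] == FSCALE * exp(-1),
+ * i.e. roughly 0.368 * FSCALE, one e-folding of decay per 20 seconds of
+ * swtime under SCHED_4BSD.
+ */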
+
+/*
+ * This function is analogous to the getpcpu() function in the ps(1) command.
+ * They should both calculate in the same way so that the racct %cpu
+ * calculations are consistent with the values shown by the ps(1) tool.
+ * The calculations are more complex in the 4BSD scheduler because of the value
+ * of the ccpu variable. In ULE it is defined to be zero which saves us some
+ * work.
+ */
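+/*
+ * The value returned below is a percentage scaled by 1,000,000 (so
+ * 1500000 means 1.5% cpu), matching the RACCT_IN_MILLIONS convention
+ * used for RACCT_PCTCPU.
+ */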
+static uint64_t
+racct_getpcpu(struct proc *p, u_int pcpu)
+{
+ u_int swtime;
+#ifdef SCHED_4BSD
+ fixpt_t pctcpu, pctcpu_next;
+#endif
+#ifdef SMP
+ struct pcpu *pc;
+ int found;
+#endif
+ fixpt_t p_pctcpu;
+ struct thread *td;
+
+ /*
+ * If the process is swapped out, we count its %cpu usage as zero.
+ * This behaviour is consistent with the userland ps(1) tool.
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ return (0);
+ swtime = (ticks - p->p_swtick) / hz;
+
+ /*
+ * For short-lived processes, sched_pctcpu() returns small
+ * values even for CPU-intensive processes. Therefore we use
+ * our own estimate in this case.
+ */
+ if (swtime < RACCT_PCPU_SECS)
+ return (pcpu);
+
+ p_pctcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td == PCPU_GET(idlethread))
+ continue;
+#ifdef SMP
+ found = 0;
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ if (td == pc->pc_idlethread) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+#endif
+ thread_lock(td);
+#ifdef SCHED_4BSD
+ pctcpu = sched_pctcpu(td);
+ /* Count also the yet unfinished second. */
+ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
+ pctcpu_next += sched_pctcpu_delta(td);
+ p_pctcpu += max(pctcpu, pctcpu_next);
+#else
+ /*
+ * In ULE the %cpu statistics are updated on every
+ * sched_pctcpu() call. So special calculations to
+ * account for the latest (unfinished) second are
+ * not needed.
+ */
+ p_pctcpu += sched_pctcpu(td);
+#endif
+ thread_unlock(td);
+ }
+
+#ifdef SCHED_4BSD
+ if (swtime <= CCPU_EXP_MAX)
+ return ((100 * (uint64_t)p_pctcpu * 1000000) /
+ (FSCALE - ccpu_exp[swtime]));
+#endif
+
+ return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
+}
+
+static void
+racct_add_racct(struct racct *dest, const struct racct *src)
+{
+ int i;
+
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ /*
+ * Update resource usage in dest.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ KASSERT(dest->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: dest < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: src < 0",
+ __func__, i));
+ dest->r_resources[i] += src->r_resources[i];
+ }
+}
+
+static void
+racct_sub_racct(struct racct *dest, const struct racct *src)
+{
+ int i;
+
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ /*
+ * Update resource usage in dest.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
+ KASSERT(dest->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: dest < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: src < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] <= dest->r_resources[i],
+ ("%s: resource %d propagation meltdown: src > dest",
+ __func__, i));
+ }
+ if (RACCT_CAN_DROP(i)) {
+ dest->r_resources[i] -= src->r_resources[i];
+ if (dest->r_resources[i] < 0) {
+ KASSERT(RACCT_IS_SLOPPY(i) ||
+ RACCT_IS_DECAYING(i),
+ ("%s: resource %d usage < 0", __func__, i));
+ dest->r_resources[i] = 0;
+ }
+ }
+ }
+}
+
+void
+racct_create(struct racct **racctp)
+{
+
+ SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
+
+ KASSERT(*racctp == NULL, ("racct already allocated"));
+
+ *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
+}
+
+static void
+racct_destroy_locked(struct racct **racctp)
+{
+ int i;
+ struct racct *racct;
+
+ SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
+
+ mtx_assert(&racct_lock, MA_OWNED);
+ KASSERT(racctp != NULL, ("NULL racctp"));
+ KASSERT(*racctp != NULL, ("NULL racct"));
+
+ racct = *racctp;
+
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (RACCT_IS_SLOPPY(i))
+ continue;
+ if (!RACCT_IS_RECLAIMABLE(i))
+ continue;
+ KASSERT(racct->r_resources[i] == 0,
+ ("destroying non-empty racct: "
+ "%ju allocated for resource %d\n",
+ racct->r_resources[i], i));
+ }
+ uma_zfree(racct_zone, racct);
+ *racctp = NULL;
+}
+
+void
+racct_destroy(struct racct **racct)
+{
+
+ mtx_lock(&racct_lock);
+ racct_destroy_locked(racct);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Increase consumption of 'resource' by 'amount' for 'racct'. Propagation
+ * to the per-credential containers is handled by the callers. Unlike in
+ * other cases, 'amount' here may be less than zero.
+ */
+static void
+racct_alloc_resource(struct racct *racct, int resource,
+ uint64_t amount)
+{
+
+ mtx_assert(&racct_lock, MA_OWNED);
+ KASSERT(racct != NULL, ("NULL racct"));
+
+ racct->r_resources[resource] += amount;
+ if (racct->r_resources[resource] < 0) {
+ KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
+ ("%s: resource %d usage < 0", __func__, resource));
+ racct->r_resources[resource] = 0;
+ }
+
+ /*
+ * There are some cases where the racct %cpu resource would grow
+ * beyond 100%. For example, in racct_proc_exit() we add the process
+ * %cpu usage to the ucred racct containers; if too many processes
+ * terminate in a short time span, the ucred %cpu resource could grow
+ * too much. Also, the 4BSD scheduler sometimes returns more than
+ * 100% cpu usage for a thread. So we cap the value at 100% here.
+ */
+ if ((resource == RACCT_PCTCPU) &&
+ (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
+ racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
+}
+
+static int
+racct_add_locked(struct proc *p, int resource, uint64_t amount)
+{
+#ifdef RCTL
+ int error;
+#endif
+
+ SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+#ifdef RCTL
+ error = rctl_enforce(p, resource, amount);
+ if (error && RACCT_IS_DENIABLE(resource)) {
+ SDT_PROBE(racct, kernel, rusage, add_failure, p, resource,
+ amount, 0, 0);
+ return (error);
+ }
+#endif
+ racct_alloc_resource(p->p_racct, resource, amount);
+ racct_add_cred_locked(p->p_ucred, resource, amount);
+
+ return (0);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Return 0 if it is below the limits, or an errno value if it is not.
+ */
+int
+racct_add(struct proc *p, int resource, uint64_t amount)
+{
+ int error;
+
+ mtx_lock(&racct_lock);
+ error = racct_add_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+ return (error);
+}
+
+static void
+racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
+{
+ struct prison *pr;
+
+ SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount,
+ 0, 0);
+
+ racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
+ racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
+ amount);
+ racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for credential 'cred'.
+ * Doesn't check for limits and never fails.
+ *
+ * XXX: Shouldn't this ever return an error?
+ */
+void
+racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+
+ mtx_lock(&racct_lock);
+ racct_add_cred_locked(cred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Doesn't check for limits and never fails.
+ */
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ mtx_lock(&racct_lock);
+ racct_alloc_resource(p->p_racct, resource, amount);
+ mtx_unlock(&racct_lock);
+ racct_add_cred(p->p_ucred, resource, amount);
+}
+
+static int
+racct_set_locked(struct proc *p, int resource, uint64_t amount)
+{
+ int64_t old_amount, decayed_amount;
+ int64_t diff_proc, diff_cred;
+#ifdef RCTL
+ int error;
+#endif
+
+ SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ old_amount = p->p_racct->r_resources[resource];
+ /*
+ * The diffs may be negative.
+ */
+ diff_proc = amount - old_amount;
+ if (RACCT_IS_DECAYING(resource)) {
+ /*
+ * Resources in per-credential racct containers may decay.
+ * If this is the case, we need to calculate the difference
+ * between the new amount and the proportional value of the
+ * old amount that has decayed in the ucred racct containers.
+ */
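+ /*
+ * Illustrative example (with the 0.3 decay factor above): if the
+ * old per-process amount was 100 and the new amount is 90, the
+ * credential containers retain about 30 of the old amount, so
+ * diff_cred = 90 - 30 = 60 while diff_proc = 90 - 100 = -10.
+ */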
+ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+ diff_cred = amount - decayed_amount;
+ } else
+ diff_cred = diff_proc;
+#ifdef notyet
+ KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
+ ("%s: usage of non-droppable resource %d dropping", __func__,
+ resource));
+#endif
+#ifdef RCTL
+ if (diff_proc > 0) {
+ error = rctl_enforce(p, resource, diff_proc);
+ if (error && RACCT_IS_DENIABLE(resource)) {
+ SDT_PROBE(racct, kernel, rusage, set_failure, p,
+ resource, amount, 0, 0);
+ return (error);
+ }
+ }
+#endif
+ racct_alloc_resource(p->p_racct, resource, diff_proc);
+ if (diff_cred > 0)
+ racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+ else if (diff_cred < 0)
+ racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
+
+ return (0);
+}
+
+/*
+ * Set allocation of 'resource' to 'amount' for process 'p'.
+ * Return 0 if it is below the limits, or an errno value if it is not.
+ *
+ * Note that decreasing the allocation always returns 0,
+ * even if it's above the limit.
+ */
+int
+racct_set(struct proc *p, int resource, uint64_t amount)
+{
+ int error;
+
+ mtx_lock(&racct_lock);
+ error = racct_set_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+ return (error);
+}
+
+static void
+racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
+{
+ int64_t old_amount, decayed_amount;
+ int64_t diff_proc, diff_cred;
+
+ SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ old_amount = p->p_racct->r_resources[resource];
+ /*
+ * The diffs may be negative.
+ */
+ diff_proc = amount - old_amount;
+ if (RACCT_IS_DECAYING(resource)) {
+ /*
+ * Resources in per-credential racct containers may decay.
+ * If this is the case, we need to calculate the difference
+ * between the new amount and the proportional value of the
+ * old amount that has decayed in the ucred racct containers.
+ */
+ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+ diff_cred = amount - decayed_amount;
+ } else
+ diff_cred = diff_proc;
+
+ racct_alloc_resource(p->p_racct, resource, diff_proc);
+ if (diff_cred > 0)
+ racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+ else if (diff_cred < 0)
+ racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
+}
+
+void
+racct_set_force(struct proc *p, int resource, uint64_t amount)
+{
+ mtx_lock(&racct_lock);
+ racct_set_force_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Returns the amount of 'resource' that process 'p' can keep allocated.
+ * Allocating more than that would be denied, unless the resource
+ * is marked undeniable. The amount already allocated does not
+ * matter.
+ */
+uint64_t
+racct_get_limit(struct proc *p, int resource)
+{
+
+#ifdef RCTL
+ return (rctl_get_limit(p, resource));
+#else
+ return (UINT64_MAX);
+#endif
+}
+
+/*
+ * Returns the amount of 'resource' that process 'p' can keep allocated.
+ * Allocating more than that would be denied, unless the resource
+ * is marked undeniable. The amount already allocated does
+ * matter.
+ */
+uint64_t
+racct_get_available(struct proc *p, int resource)
+{
+
+#ifdef RCTL
+ return (rctl_get_available(p, resource));
+#else
+ return (UINT64_MAX);
+#endif
+}
+
+/*
+ * Returns amount of the %cpu resource that process 'p' can add to its %cpu
+ * utilization. Adding more than that would lead to the process being
+ * throttled.
+ */
+static int64_t
+racct_pcpu_available(struct proc *p)
+{
+
+#ifdef RCTL
+ return (rctl_pcpu_available(p));
+#else
+ return (INT64_MAX);
+#endif
+}
+
+/*
+ * Decrease allocation of 'resource' by 'amount' for process 'p'.
+ */
+void
+racct_sub(struct proc *p, int resource, uint64_t amount)
+{
+
+ SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(RACCT_CAN_DROP(resource),
+ ("%s: called for non-droppable resource %d", __func__, resource));
+
+ mtx_lock(&racct_lock);
+ KASSERT(amount <= p->p_racct->r_resources[resource],
+ ("%s: freeing %ju of resource %d, which is more "
+ "than allocated %jd for %s (pid %d)", __func__, amount, resource,
+ (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
+
+ racct_alloc_resource(p->p_racct, resource, -amount);
+ racct_sub_cred_locked(p->p_ucred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
+{
+ struct prison *pr;
+
+ SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount,
+ 0, 0);
+
+#ifdef notyet
+ KASSERT(RACCT_CAN_DROP(resource),
+ ("%s: called for resource %d which can not drop", __func__,
+ resource));
+#endif
+
+ racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
+ racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
+ -amount);
+ racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
+}
+
+/*
+ * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
+ */
+void
+racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+
+ mtx_lock(&racct_lock);
+ racct_sub_cred_locked(cred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Inherit resource usage information from the parent process.
+ */
+int
+racct_proc_fork(struct proc *parent, struct proc *child)
+{
+ int i, error = 0;
+
+ /*
+ * Create racct for the child process.
+ */
+ racct_create(&child->p_racct);
+
+ PROC_LOCK(parent);
+ PROC_LOCK(child);
+ mtx_lock(&racct_lock);
+
+#ifdef RCTL
+ error = rctl_proc_fork(parent, child);
+ if (error != 0)
+ goto out;
+#endif
+
+ /* Init process cpu time. */
+ child->p_prev_runtime = 0;
+ child->p_throttled = 0;
+
+ /*
+ * Inherit resource usage.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (parent->p_racct->r_resources[i] == 0 ||
+ !RACCT_IS_INHERITABLE(i))
+ continue;
+
+ error = racct_set_locked(child, i,
+ parent->p_racct->r_resources[i]);
+ if (error != 0)
+ goto out;
+ }
+
+ error = racct_add_locked(child, RACCT_NPROC, 1);
+ error += racct_add_locked(child, RACCT_NTHR, 1);
+
+out:
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(child);
+ PROC_UNLOCK(parent);
+
+ if (error != 0)
+ racct_proc_exit(child);
+
+ return (error);
+}
+
+/*
+ * Called at the end of fork1(), to handle rules that require the process
+ * to be fully initialized.
+ */
+void
+racct_proc_fork_done(struct proc *child)
+{
+
+#ifdef RCTL
+ PROC_LOCK(child);
+ mtx_lock(&racct_lock);
+ rctl_enforce(child, RACCT_NPROC, 0);
+ rctl_enforce(child, RACCT_NTHR, 0);
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(child);
+#endif
+}
+
+void
+racct_proc_exit(struct proc *p)
+{
+ int i;
+ uint64_t runtime;
+ struct timeval wallclock;
+ uint64_t pct_estimate, pct;
+
+ PROC_LOCK(p);
+ /*
+ * We don't need to calculate rux; proc_reap() has already done this.
+ */
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+#ifdef notyet
+ KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
+#else
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
+#endif
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+
+ mtx_lock(&racct_lock);
+ racct_set_locked(p, RACCT_CPU, runtime);
+ racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (p->p_racct->r_resources[i] == 0)
+ continue;
+ if (!RACCT_IS_RECLAIMABLE(i))
+ continue;
+ racct_set_locked(p, i, 0);
+ }
+
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(p);
+
+#ifdef RCTL
+ rctl_racct_release(p->p_racct);
+#endif
+ racct_destroy(&p->p_racct);
+}
+
+/*
+ * Called after credentials change, to move resource utilisation
+ * between raccts.
+ */
+void
+racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
+ struct ucred *newcred)
+{
+ struct uidinfo *olduip, *newuip;
+ struct loginclass *oldlc, *newlc;
+ struct prison *oldpr, *newpr, *pr;
+
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+
+ newuip = newcred->cr_ruidinfo;
+ olduip = oldcred->cr_ruidinfo;
+ newlc = newcred->cr_loginclass;
+ oldlc = oldcred->cr_loginclass;
+ newpr = newcred->cr_prison;
+ oldpr = oldcred->cr_prison;
+
+ mtx_lock(&racct_lock);
+ if (newuip != olduip) {
+ racct_sub_racct(olduip->ui_racct, p->p_racct);
+ racct_add_racct(newuip->ui_racct, p->p_racct);
+ }
+ if (newlc != oldlc) {
+ racct_sub_racct(oldlc->lc_racct, p->p_racct);
+ racct_add_racct(newlc->lc_racct, p->p_racct);
+ }
+ if (newpr != oldpr) {
+ for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
+ racct_sub_racct(pr->pr_prison_racct->prr_racct,
+ p->p_racct);
+ for (pr = newpr; pr != NULL; pr = pr->pr_parent)
+ racct_add_racct(pr->pr_prison_racct->prr_racct,
+ p->p_racct);
+ }
+ mtx_unlock(&racct_lock);
+
+#ifdef RCTL
+ rctl_proc_ucred_changed(p, newcred);
+#endif
+}
+
+void
+racct_move(struct racct *dest, struct racct *src)
+{
+
+ mtx_lock(&racct_lock);
+
+ racct_add_racct(dest, src);
+ racct_sub_racct(src, src);
+
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_proc_throttle(struct proc *p)
+{
+ struct thread *td;
+#ifdef SMP
+ int cpuid;
+#endif
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * Do not block kernel processes. Also do not block processes with
+ * low %cpu utilization to improve interactivity.
+ */
+ if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
+ (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ return;
+ p->p_throttled = 1;
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ switch (td->td_state) {
+ case TDS_RUNQ:
+ /*
+ * If the thread is on the scheduler run-queue, we
+ * cannot just remove it from there. So we set the flag
+ * TDF_NEEDRESCHED for the thread, so that once it is
+ * running, it is taken off the cpu as soon as possible.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+ break;
+ case TDS_RUNNING:
+ /*
+ * If the thread is running, we request a context
+ * switch for it by setting the TDF_NEEDRESCHED flag.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+#ifdef SMP
+ cpuid = td->td_oncpu;
+ if ((cpuid != NOCPU) && (td != curthread))
+ ipi_cpu(cpuid, IPI_AST);
+#endif
+ break;
+ default:
+ break;
+ }
+ thread_unlock(td);
+ }
+}
+
+static void
+racct_proc_wakeup(struct proc *p)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (p->p_throttled) {
+ p->p_throttled = 0;
+ wakeup(p->p_racct);
+ }
+}
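+/*
+ * The matching sleep on the p->p_racct wait channel is performed outside
+ * this file; a throttled thread blocks there until racct_proc_wakeup()
+ * clears p_throttled and wakes it up.
+ */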
+
+static void
+racct_decay_resource(struct racct *racct, void *res, void *dummy)
+{
+ int resource;
+ int64_t r_old, r_new;
+
+ resource = *(int *)res;
+ r_old = racct->r_resources[resource];
+
+ /* If there is nothing to decay, just exit. */
+ if (r_old <= 0)
+ return;
+
+ mtx_lock(&racct_lock);
+ r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+ racct->r_resources[resource] = r_new;
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_decay(int resource)
+{
+ ui_racct_foreach(racct_decay_resource, &resource, NULL);
+ loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
+ prison_racct_foreach(racct_decay_resource, &resource, NULL);
+}
+
+static void
+racctd(void)
+{
+ struct thread *td;
+ struct proc *p;
+ struct timeval wallclock;
+ uint64_t runtime;
+ uint64_t pct, pct_estimate;
+
+ for (;;) {
+ racct_decay(RACCT_PCTCPU);
+
+ sx_slock(&allproc_lock);
+
+ LIST_FOREACH(p, &zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
+
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ ruxagg(p, td);
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+ PROC_SUNLOCK(p);
+#ifdef notyet
+ KASSERT(runtime >= p->p_prev_runtime,
+ ("runtime < p_prev_runtime"));
+#else
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
+#endif
+ p->p_prev_runtime = runtime;
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ mtx_lock(&racct_lock);
+ racct_set_force_locked(p, RACCT_PCTCPU, pct);
+ racct_set_locked(p, RACCT_CPU, runtime);
+ racct_set_locked(p, RACCT_WALLCLOCK,
+ (uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * To ensure that processes are throttled in a fair way, we
+ * iterate over all processes again and check the %cpu resource
+ * limits only after the ucred racct containers have been
+ * properly filled.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ if (racct_pcpu_available(p) <= 0)
+ racct_proc_throttle(p);
+ else if (p->p_throttled)
+ racct_proc_wakeup(p);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ pause("-", hz);
+ }
+}
+
+static struct kproc_desc racctd_kp = {
+ "racctd",
+ racctd,
+ NULL
+};
+SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
+
+static void
+racct_init(void)
+{
+
+ racct_zone = uma_zcreate("racct", sizeof(struct racct),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ /*
+ * XXX: Move this somewhere.
+ */
+ prison0.pr_prison_racct = prison_racct_find("0");
+}
+SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
+
+#else /* !RACCT */
+
+int
+racct_add(struct proc *p, int resource, uint64_t amount)
+{
+
+ return (0);
+}
+
+void
+racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+}
+
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ return;
+}
+
+int
+racct_set(struct proc *p, int resource, uint64_t amount)
+{
+
+ return (0);
+}
+
+void
+racct_set_force(struct proc *p, int resource, uint64_t amount)
+{
+}
+
+void
+racct_sub(struct proc *p, int resource, uint64_t amount)
+{
+}
+
+void
+racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+}
+
+uint64_t
+racct_get_limit(struct proc *p, int resource)
+{
+
+ return (UINT64_MAX);
+}
+
+uint64_t
+racct_get_available(struct proc *p, int resource)
+{
+
+ return (UINT64_MAX);
+}
+
+void
+racct_create(struct racct **racctp)
+{
+}
+
+void
+racct_destroy(struct racct **racctp)
+{
+}
+
+int
+racct_proc_fork(struct proc *parent, struct proc *child)
+{
+
+ return (0);
+}
+
+void
+racct_proc_fork_done(struct proc *child)
+{
+}
+
+void
+racct_proc_exit(struct proc *p)
+{
+}
+
+#endif /* !RACCT */
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
new file mode 100644
index 0000000..1c0faa3
--- /dev/null
+++ b/sys/kern/kern_rangelock.c
@@ -0,0 +1,248 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+struct rl_q_entry {
+ TAILQ_ENTRY(rl_q_entry) rl_q_link;
+ off_t rl_q_start, rl_q_end;
+ int rl_q_flags;
+};
+
+static uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+ rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+static struct rl_q_entry *
+rlqentry_alloc(void)
+{
+
+ return (uma_zalloc(rl_entry_zone, M_WAITOK));
+}
+
+void
+rlqentry_free(struct rl_q_entry *rleq)
+{
+
+ uma_zfree(rl_entry_zone, rleq);
+}
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+ TAILQ_INIT(&lock->rl_waiters);
+ lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+ KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+/*
+ * Two entries are compatible if their ranges do not overlap, or both
+ * entries are for read.
+ */
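+/*
+ * For example, ranges [0, 10) and [10, 20) do not overlap and are always
+ * compatible; [0, 10) and [5, 15) overlap and are compatible only if both
+ * requests are for read.
+ */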
+static int
+ranges_overlap(const struct rl_q_entry *e1,
+ const struct rl_q_entry *e2)
+{
+
+ if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+ return (1);
+ return (0);
+}
+
+/*
+ * Recalculate the lock->rl_currdep after an unlock.
+ */
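+/*
+ * Entries that precede rl_currdep on the rl_waiters queue have already
+ * been granted; the loop below grants any newly compatible requests and
+ * leaves rl_currdep pointing at the first waiter that remains blocked
+ * (or NULL if there is none).
+ */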
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+ struct rl_q_entry *entry, *nextentry, *entry1;
+
+ for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
+ nextentry = TAILQ_NEXT(entry, rl_q_link);
+ if (entry->rl_q_flags & RL_LOCK_READ) {
+ /* Reads must not overlap with granted writes. */
+ for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
+ !(entry1->rl_q_flags & RL_LOCK_READ);
+ entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
+ if (ranges_overlap(entry, entry1))
+ goto out;
+ }
+ } else {
+ /* Write must not overlap with any granted locks. */
+ for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
+ entry1 != entry;
+ entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
+ if (ranges_overlap(entry, entry1))
+ goto out;
+ }
+
+ /* Move grantable write locks to the front. */
+ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+ TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
+ }
+
+ /* Grant this lock. */
+ entry->rl_q_flags |= RL_LOCK_GRANTED;
+ wakeup(entry);
+ }
+out:
+ lock->rl_currdep = entry;
+}
+
+static void
+rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
+ struct mtx *ilk)
+{
+
+ MPASS(lock != NULL && entry != NULL && ilk != NULL);
+ mtx_assert(ilk, MA_OWNED);
+ KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
+
+ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+ rangelock_calc_block(lock);
+ mtx_unlock(ilk);
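+ /*
+ * Cache the entry in the thread for reuse by a later
+ * rangelock_enqueue() call, avoiding a zone allocation.
+ */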
+ if (curthread->td_rlqe == NULL)
+ curthread->td_rlqe = entry;
+ else
+ rlqentry_free(entry);
+}
+
+void
+rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+{
+
+ MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+
+ mtx_lock(ilk);
+ rangelock_unlock_locked(lock, cookie, ilk);
+}
+
+/*
+ * Unlock a sub-range of a granted lock.
+ */
+void *
+rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
+ off_t end, struct mtx *ilk)
+{
+ struct rl_q_entry *entry;
+
+ MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+ entry = cookie;
+ KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
+ ("Unlocking non-granted lock"));
+ KASSERT(entry->rl_q_start == start, ("wrong start"));
+ KASSERT(entry->rl_q_end >= end, ("wrong end"));
+
+ mtx_lock(ilk);
+ if (entry->rl_q_end == end) {
+ rangelock_unlock_locked(lock, cookie, ilk);
+ return (NULL);
+ }
+ entry->rl_q_end = end;
+ rangelock_calc_block(lock);
+ mtx_unlock(ilk);
+ return (cookie);
+}
+
+/*
+ * Add the lock request to the queue of pending requests for the
+ * rangelock. Sleep until the request can be granted.
+ */
+static void *
+rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
+ struct mtx *ilk)
+{
+ struct rl_q_entry *entry;
+ struct thread *td;
+
+ MPASS(lock != NULL && ilk != NULL);
+
+ td = curthread;
+ if (td->td_rlqe != NULL) {
+ entry = td->td_rlqe;
+ td->td_rlqe = NULL;
+ } else
+ entry = rlqentry_alloc();
+ MPASS(entry != NULL);
+ entry->rl_q_flags = mode;
+ entry->rl_q_start = start;
+ entry->rl_q_end = end;
+
+ mtx_lock(ilk);
+ /*
+ * XXXKIB TODO. Check that a thread does not try to enqueue a
+ * lock that is incompatible with another request from the same
+ * thread.
+ */
+
+ TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
+ if (lock->rl_currdep == NULL)
+ lock->rl_currdep = entry;
+ rangelock_calc_block(lock);
+ while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+ msleep(entry, ilk, 0, "range", 0);
+ mtx_unlock(ilk);
+ return (entry);
+}
+
+void *
+rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+ return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+ return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk));
+}
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
new file mode 100644
index 0000000..934327a
--- /dev/null
+++ b/sys/kern/kern_rctl.c
@@ -0,0 +1,1870 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/loginclass.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/rctl.h>
+#include <sys/resourcevar.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <vm/uma.h>
+
+#ifdef RCTL
+#ifndef RACCT
+#error "The RCTL option requires the RACCT option"
+#endif
+
+FEATURE(rctl, "Resource Limits");
+
+#define HRF_DEFAULT 0
+#define HRF_DONT_INHERIT 1
+#define HRF_DONT_ACCUMULATE 2
+
+/* Default buffer size for rctl_get_rules(2). */
+#define RCTL_DEFAULT_BUFSIZE 4096
+#define RCTL_MAX_INBUFLEN 4096
+#define RCTL_LOG_BUFSIZE 128
+
+#define RCTL_PCPU_SHIFT (10 * 1000000)
+
+/*
+ * 'rctl_rule_link' connects a rule with every racct it's related to.
+ * For example, the rule 'user:X:openfiles:deny=N/process' is linked
+ * to the uidinfo for user X and to each process of that user.
+ */
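+/*
+ * More generally, rules take the form
+ * "subject:subject-id:resource:action=amount/per"; the subjectnames,
+ * resourcenames and actionnames tables below list the recognized tokens.
+ */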
+struct rctl_rule_link {
+ LIST_ENTRY(rctl_rule_link) rrl_next;
+ struct rctl_rule *rrl_rule;
+ int rrl_exceeded;
+};
+
+struct dict {
+ const char *d_name;
+ int d_value;
+};
+
+static struct dict subjectnames[] = {
+ { "process", RCTL_SUBJECT_TYPE_PROCESS },
+ { "user", RCTL_SUBJECT_TYPE_USER },
+ { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
+ { "jail", RCTL_SUBJECT_TYPE_JAIL },
+ { NULL, -1 }};
+
+static struct dict resourcenames[] = {
+ { "cputime", RACCT_CPU },
+ { "datasize", RACCT_DATA },
+ { "stacksize", RACCT_STACK },
+ { "coredumpsize", RACCT_CORE },
+ { "memoryuse", RACCT_RSS },
+ { "memorylocked", RACCT_MEMLOCK },
+ { "maxproc", RACCT_NPROC },
+ { "openfiles", RACCT_NOFILE },
+ { "vmemoryuse", RACCT_VMEM },
+ { "pseudoterminals", RACCT_NPTS },
+ { "swapuse", RACCT_SWAP },
+ { "nthr", RACCT_NTHR },
+ { "msgqqueued", RACCT_MSGQQUEUED },
+ { "msgqsize", RACCT_MSGQSIZE },
+ { "nmsgq", RACCT_NMSGQ },
+ { "nsem", RACCT_NSEM },
+ { "nsemop", RACCT_NSEMOP },
+ { "nshm", RACCT_NSHM },
+ { "shmsize", RACCT_SHMSIZE },
+ { "wallclock", RACCT_WALLCLOCK },
+ { "pcpu", RACCT_PCTCPU },
+ { NULL, -1 }};
+
+static struct dict actionnames[] = {
+ { "sighup", RCTL_ACTION_SIGHUP },
+ { "sigint", RCTL_ACTION_SIGINT },
+ { "sigquit", RCTL_ACTION_SIGQUIT },
+ { "sigill", RCTL_ACTION_SIGILL },
+ { "sigtrap", RCTL_ACTION_SIGTRAP },
+ { "sigabrt", RCTL_ACTION_SIGABRT },
+ { "sigemt", RCTL_ACTION_SIGEMT },
+ { "sigfpe", RCTL_ACTION_SIGFPE },
+ { "sigkill", RCTL_ACTION_SIGKILL },
+ { "sigbus", RCTL_ACTION_SIGBUS },
+ { "sigsegv", RCTL_ACTION_SIGSEGV },
+ { "sigsys", RCTL_ACTION_SIGSYS },
+ { "sigpipe", RCTL_ACTION_SIGPIPE },
+ { "sigalrm", RCTL_ACTION_SIGALRM },
+ { "sigterm", RCTL_ACTION_SIGTERM },
+ { "sigurg", RCTL_ACTION_SIGURG },
+ { "sigstop", RCTL_ACTION_SIGSTOP },
+ { "sigtstp", RCTL_ACTION_SIGTSTP },
+ { "sigchld", RCTL_ACTION_SIGCHLD },
+ { "sigttin", RCTL_ACTION_SIGTTIN },
+ { "sigttou", RCTL_ACTION_SIGTTOU },
+ { "sigio", RCTL_ACTION_SIGIO },
+ { "sigxcpu", RCTL_ACTION_SIGXCPU },
+ { "sigxfsz", RCTL_ACTION_SIGXFSZ },
+ { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
+ { "sigprof", RCTL_ACTION_SIGPROF },
+ { "sigwinch", RCTL_ACTION_SIGWINCH },
+ { "siginfo", RCTL_ACTION_SIGINFO },
+ { "sigusr1", RCTL_ACTION_SIGUSR1 },
+ { "sigusr2", RCTL_ACTION_SIGUSR2 },
+ { "sigthr", RCTL_ACTION_SIGTHR },
+ { "deny", RCTL_ACTION_DENY },
+ { "log", RCTL_ACTION_LOG },
+ { "devctl", RCTL_ACTION_DEVCTL },
+ { NULL, -1 }};
+
+static void rctl_init(void);
+SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
+
+static uma_zone_t rctl_rule_link_zone;
+static uma_zone_t rctl_rule_zone;
+static struct rwlock rctl_lock;
+RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
+
+static int rctl_rule_fully_specified(const struct rctl_rule *rule);
+static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
+
+static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
+
+static const char *
+rctl_subject_type_name(int subject)
+{
+ int i;
+
+ for (i = 0; subjectnames[i].d_name != NULL; i++) {
+ if (subjectnames[i].d_value == subject)
+ return (subjectnames[i].d_name);
+ }
+
+ panic("rctl_subject_type_name: unknown subject type %d", subject);
+}
+
+static const char *
+rctl_action_name(int action)
+{
+ int i;
+
+ for (i = 0; actionnames[i].d_name != NULL; i++) {
+ if (actionnames[i].d_value == action)
+ return (actionnames[i].d_name);
+ }
+
+ panic("rctl_action_name: unknown action %d", action);
+}
+
+const char *
+rctl_resource_name(int resource)
+{
+ int i;
+
+ for (i = 0; resourcenames[i].d_name != NULL; i++) {
+ if (resourcenames[i].d_value == resource)
+ return (resourcenames[i].d_name);
+ }
+
+ panic("rctl_resource_name: unknown resource %d", resource);
+}
+
+/*
+ * Return the amount of resource that can be allocated by 'p' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+{
+ int resource;
+ int64_t available = INT64_MAX;
+ struct ucred *cred = p->p_ucred;
+
+ rw_assert(&rctl_lock, RA_LOCKED);
+
+ resource = rule->rr_resource;
+ switch (rule->rr_per) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ available = rule->rr_amount -
+ p->p_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ available = rule->rr_amount -
+ cred->cr_ruidinfo->ui_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ available = rule->rr_amount -
+ cred->cr_loginclass->lc_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ available = rule->rr_amount -
+ cred->cr_prison->pr_prison_racct->prr_racct->
+ r_resources[resource];
+ break;
+ default:
+ panic("rctl_compute_available: unknown per %d",
+ rule->rr_per);
+ }
+
+ return (available);
+}
+
+/*
+ * Return non-zero if allocating 'amount' by proc 'p' would exceed
+ * resource limit specified by 'rule'.
+ */
+static int
+rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
+ int64_t amount)
+{
+ int64_t available;
+
+ rw_assert(&rctl_lock, RA_LOCKED);
+
+ available = rctl_available_resource(p, rule);
+ if (available >= amount)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Special version of the rctl_get_available() function for the %cpu
+ * resource.  We slightly cheat here and return less than we normally would.
+ */
+int64_t
+rctl_pcpu_available(const struct proc *p)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t available, minavailable, limit;
+
+ minavailable = INT64_MAX;
+ limit = 0;
+
+ rw_rlock(&rctl_lock);
+
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != RACCT_PCTCPU)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ available = rctl_available_resource(p, rule);
+ if (available < minavailable) {
+ minavailable = available;
+ limit = rule->rr_amount;
+ }
+ }
+
+ rw_runlock(&rctl_lock);
+
+ /*
+ * Return slightly less than the actual value of the available
+ * %cpu resource.  This makes %cpu throttling more aggressive
+ * and lets us act before the limit is actually exceeded.
+ */
+ if (limit != 0) {
+ if (limit > 2 * RCTL_PCPU_SHIFT)
+ minavailable -= RCTL_PCPU_SHIFT;
+ else
+ minavailable -= (limit / 2);
+ }
+
+ return (minavailable);
+}
+
+/*
+ * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
+ * to what it already has allocated.  Returns non-zero if the allocation should
+ * be denied, 0 otherwise.
+ */
+int
+rctl_enforce(struct proc *p, int resource, uint64_t amount)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ struct sbuf sb;
+ int should_deny = 0;
+ char *buf;
+ static int curtime = 0;
+ static struct timeval lasttime;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them.
+ * Denial should be done last, after logging and sending signals.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (!rctl_would_exceed(p, rule, amount)) {
+ link->rrl_exceeded = 0;
+ continue;
+ }
+
+ switch (rule->rr_action) {
+ case RCTL_ACTION_DENY:
+ should_deny = 1;
+ continue;
+ case RCTL_ACTION_LOG:
+ /*
+ * If rrl_exceeded != 0, it means we've already
+ * logged a warning for this process.
+ */
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ /*
+ * If the process state is not fully initialized yet,
+ * we can't access most of the required fields, e.g.
+ * p->p_comm. This happens when called from fork1().
+ * Ignore this rule for now; it will be processed just
+ * after fork, when called from racct_proc_fork_done().
+ */
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
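+ /* Rate-limit the log messages to at most 10 per second. */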
+ if (!ppsratecheck(&lasttime, &curtime, 10))
+ continue;
+
+ buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+ if (buf == NULL) {
+ printf("rctl_enforce: out of memory\n");
+ continue;
+ }
+ sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+ rctl_rule_to_sbuf(&sb, rule);
+ sbuf_finish(&sb);
+ printf("rctl: rule \"%s\" matched by pid %d "
+ "(%s), uid %d, jail %s\n", sbuf_data(&sb),
+ p->p_pid, p->p_comm, p->p_ucred->cr_uid,
+ p->p_ucred->cr_prison->pr_prison_racct->prr_name);
+ sbuf_delete(&sb);
+ free(buf, M_RCTL);
+ link->rrl_exceeded = 1;
+ continue;
+ case RCTL_ACTION_DEVCTL:
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+ if (buf == NULL) {
+ printf("rctl_enforce: out of memory\n");
+ continue;
+ }
+ sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "rule=");
+ rctl_rule_to_sbuf(&sb, rule);
+ sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
+ p->p_pid, p->p_ucred->cr_ruid,
+ p->p_ucred->cr_prison->pr_prison_racct->prr_name);
+ sbuf_finish(&sb);
+ devctl_notify_f("RCTL", "rule", "matched",
+ sbuf_data(&sb), M_NOWAIT);
+ sbuf_delete(&sb);
+ free(buf, M_RCTL);
+ link->rrl_exceeded = 1;
+ continue;
+ default:
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ KASSERT(rule->rr_action > 0 &&
+ rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
+ ("rctl_enforce: unknown action %d",
+ rule->rr_action));
+
+ /*
+ * We're using the fact that RCTL_ACTION_SIG* values
+ * are equal to their counterparts from sys/signal.h.
+ */
+ kern_psignal(p, rule->rr_action);
+ link->rrl_exceeded = 1;
+ continue;
+ }
+ }
+
+ rw_runlock(&rctl_lock);
+
+ if (should_deny) {
+ /*
+ * Return a fake error code; the caller should change it
+ * into one appropriate for the situation, e.g. EFBIG or ENOMEM.
+ */
+ return (EDOOFUS);
+ }
+
+ return (0);
+}
+
+uint64_t
+rctl_get_limit(struct proc *p, int resource)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ uint64_t amount = UINT64_MAX;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them
+ * and pick the lowest (most restrictive) limit.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ if (rule->rr_amount < amount)
+ amount = rule->rr_amount;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ return (amount);
+}
+
+uint64_t
+rctl_get_available(struct proc *p, int resource)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t available, minavailable, allocated;
+
+ minavailable = INT64_MAX;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them
+ * and pick the smallest amount of the resource still available.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ available = rctl_available_resource(p, rule);
+ if (available < minavailable)
+ minavailable = available;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ /*
+ * XXX: Think about this _hard_.
+ */
+ allocated = p->p_racct->r_resources[resource];
+ if (minavailable < INT64_MAX - allocated)
+ minavailable += allocated;
+ if (minavailable < 0)
+ minavailable = 0;
+ return (minavailable);
+}
+
+static int
+rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
+{
+
+ if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
+ if (rule->rr_subject_type != filter->rr_subject_type)
+ return (0);
+
+ switch (filter->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (filter->rr_subject.rs_proc != NULL &&
+ rule->rr_subject.rs_proc !=
+ filter->rr_subject.rs_proc)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (filter->rr_subject.rs_uip != NULL &&
+ rule->rr_subject.rs_uip !=
+ filter->rr_subject.rs_uip)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (filter->rr_subject.rs_loginclass != NULL &&
+ rule->rr_subject.rs_loginclass !=
+ filter->rr_subject.rs_loginclass)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (filter->rr_subject.rs_prison_racct != NULL &&
+ rule->rr_subject.rs_prison_racct !=
+ filter->rr_subject.rs_prison_racct)
+ return (0);
+ break;
+ default:
+ panic("rctl_rule_matches: unknown subject type %d",
+ filter->rr_subject_type);
+ }
+ }
+
+ if (filter->rr_resource != RACCT_UNDEFINED) {
+ if (rule->rr_resource != filter->rr_resource)
+ return (0);
+ }
+
+ if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
+ if (rule->rr_action != filter->rr_action)
+ return (0);
+ }
+
+ if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
+ if (rule->rr_amount != filter->rr_amount)
+ return (0);
+ }
+
+ if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
+ if (rule->rr_per != filter->rr_per)
+ return (0);
+ }
+
+ return (1);
+}
+
+static int
+str2value(const char *str, int *value, struct dict *table)
+{
+ int i;
+
+ if (value == NULL)
+ return (EINVAL);
+
+ for (i = 0; table[i].d_name != NULL; i++) {
+ if (strcasecmp(table[i].d_name, str) == 0) {
+ *value = table[i].d_value;
+ return (0);
+ }
+ }
+
+ return (EINVAL);
+}
+
+static int
+str2id(const char *str, id_t *value)
+{
+ char *end;
+
+ if (str == NULL)
+ return (EINVAL);
+
+ *value = strtoul(str, &end, 10);
+ if ((size_t)(end - str) != strlen(str))
+ return (EINVAL);
+
+ return (0);
+}
+
+static int
+str2int64(const char *str, int64_t *value)
+{
+ char *end;
+
+ if (str == NULL)
+ return (EINVAL);
+
+ *value = strtoul(str, &end, 10);
+ if ((size_t)(end - str) != strlen(str))
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Connect the rule to the racct, increasing refcount for the rule.
+ */
+static void
+rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
+{
+ struct rctl_rule_link *link;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+
+ rctl_rule_acquire(rule);
+ link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
+ link->rrl_rule = rule;
+ link->rrl_exceeded = 0;
+
+ rw_wlock(&rctl_lock);
+ LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
+ rw_wunlock(&rctl_lock);
+}
+
+static int
+rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
+{
+ struct rctl_rule_link *link;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+ rw_assert(&rctl_lock, RA_WLOCKED);
+
+ link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
+ if (link == NULL)
+ return (ENOMEM);
+ rctl_rule_acquire(rule);
+ link->rrl_rule = rule;
+ link->rrl_exceeded = 0;
+
+ LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
+ return (0);
+}
+
+/*
+ * Remove limits for rules matching the filter and release
+ * the refcounts for the rules, possibly freeing them. Returns
+ * the number of limit structures removed.
+ */
+static int
+rctl_racct_remove_rules(struct racct *racct,
+ const struct rctl_rule *filter)
+{
+ int removed = 0;
+ struct rctl_rule_link *link, *linktmp;
+
+ rw_assert(&rctl_lock, RA_WLOCKED);
+
+ LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ removed++;
+ }
+ return (removed);
+}
+
+static void
+rctl_rule_acquire_subject(struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct != NULL)
+ prison_racct_hold(rule->rr_subject.rs_prison_racct);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip != NULL)
+ uihold(rule->rr_subject.rs_uip);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass != NULL)
+ loginclass_hold(rule->rr_subject.rs_loginclass);
+ break;
+ default:
+ panic("rctl_rule_acquire_subject: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+}
+
+static void
+rctl_rule_release_subject(struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct != NULL)
+ prison_racct_free(rule->rr_subject.rs_prison_racct);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip != NULL)
+ uifree(rule->rr_subject.rs_uip);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass != NULL)
+ loginclass_free(rule->rr_subject.rs_loginclass);
+ break;
+ default:
+ panic("rctl_rule_release_subject: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+}
+
+struct rctl_rule *
+rctl_rule_alloc(int flags)
+{
+ struct rctl_rule *rule;
+
+ rule = uma_zalloc(rctl_rule_zone, flags);
+ if (rule == NULL)
+ return (NULL);
+ rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
+ rule->rr_subject.rs_proc = NULL;
+ rule->rr_subject.rs_uip = NULL;
+ rule->rr_subject.rs_loginclass = NULL;
+ rule->rr_subject.rs_prison_racct = NULL;
+ rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
+ rule->rr_resource = RACCT_UNDEFINED;
+ rule->rr_action = RCTL_ACTION_UNDEFINED;
+ rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ refcount_init(&rule->rr_refcount, 1);
+
+ return (rule);
+}
+
+struct rctl_rule *
+rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
+{
+ struct rctl_rule *copy;
+
+ copy = uma_zalloc(rctl_rule_zone, flags);
+ if (copy == NULL)
+ return (NULL);
+ copy->rr_subject_type = rule->rr_subject_type;
+ copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
+ copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
+ copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
+ copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
+ copy->rr_per = rule->rr_per;
+ copy->rr_resource = rule->rr_resource;
+ copy->rr_action = rule->rr_action;
+ copy->rr_amount = rule->rr_amount;
+ refcount_init(&copy->rr_refcount, 1);
+ rctl_rule_acquire_subject(copy);
+
+ return (copy);
+}
+
+void
+rctl_rule_acquire(struct rctl_rule *rule)
+{
+
+ KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
+
+ refcount_acquire(&rule->rr_refcount);
+}
+
+static void
+rctl_rule_free(void *context, int pending)
+{
+ struct rctl_rule *rule;
+
+ rule = (struct rctl_rule *)context;
+
+ KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
+
+ /*
+ * We don't need locking here; rule is guaranteed to be inaccessible.
+ */
+
+ rctl_rule_release_subject(rule);
+ uma_zfree(rctl_rule_zone, rule);
+}
+
+void
+rctl_rule_release(struct rctl_rule *rule)
+{
+
+ KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
+
+ if (refcount_release(&rule->rr_refcount)) {
+ /*
+ * rctl_rule_release() is often called when iterating
+ * over all the uidinfo structures in the system,
+ * holding uihashtbl_lock. Since rctl_rule_free()
+ * might end up calling uifree(), this would lead
+ * to lock recursion. Use taskqueue to avoid this.
+ */
+ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
+ taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
+ }
+}
+
+static int
+rctl_rule_fully_specified(const struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ return (0);
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (rule->rr_subject.rs_proc == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct == NULL)
+ return (0);
+ break;
+ default:
+ panic("rctl_rule_fully_specified: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+ if (rule->rr_resource == RACCT_UNDEFINED)
+ return (0);
+ if (rule->rr_action == RCTL_ACTION_UNDEFINED)
+ return (0);
+ if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
+ return (0);
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
+ return (0);
+
+ return (1);
+}
+
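+/*
+ * Parse a rule in the "subject:subject-id:resource:action=amount/per"
+ * format; any field may be left empty, leaving the corresponding part
+ * of the rule undefined.  An illustrative (hypothetical) example:
+ * "user:1001:openfiles:deny=128/process".
+ */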
+static int
+rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
+{
+ int error = 0;
+ char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
+ *amountstr, *perstr;
+ struct rctl_rule *rule;
+ id_t id;
+
+ rule = rctl_rule_alloc(M_WAITOK);
+
+ subjectstr = strsep(&rulestr, ":");
+ subject_idstr = strsep(&rulestr, ":");
+ resourcestr = strsep(&rulestr, ":");
+ actionstr = strsep(&rulestr, "=/");
+ amountstr = strsep(&rulestr, "/");
+ perstr = rulestr;
+
+ if (subjectstr == NULL || subjectstr[0] == '\0')
+ rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
+ else {
+ error = str2value(subjectstr, &rule->rr_subject_type,
+ subjectnames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (subject_idstr == NULL || subject_idstr[0] == '\0') {
+ rule->rr_subject.rs_proc = NULL;
+ rule->rr_subject.rs_uip = NULL;
+ rule->rr_subject.rs_loginclass = NULL;
+ rule->rr_subject.rs_prison_racct = NULL;
+ } else {
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ error = EINVAL;
+ goto out;
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ error = str2id(subject_idstr, &id);
+ if (error != 0)
+ goto out;
+ sx_assert(&allproc_lock, SA_LOCKED);
+ rule->rr_subject.rs_proc = pfind(id);
+ if (rule->rr_subject.rs_proc == NULL) {
+ error = ESRCH;
+ goto out;
+ }
+ PROC_UNLOCK(rule->rr_subject.rs_proc);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ error = str2id(subject_idstr, &id);
+ if (error != 0)
+ goto out;
+ rule->rr_subject.rs_uip = uifind(id);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ rule->rr_subject.rs_loginclass =
+ loginclass_find(subject_idstr);
+ if (rule->rr_subject.rs_loginclass == NULL) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ rule->rr_subject.rs_prison_racct =
+ prison_racct_find(subject_idstr);
+ if (rule->rr_subject.rs_prison_racct == NULL) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+ break;
+ default:
+ panic("rctl_string_to_rule: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+ }
+
+ if (resourcestr == NULL || resourcestr[0] == '\0')
+ rule->rr_resource = RACCT_UNDEFINED;
+ else {
+ error = str2value(resourcestr, &rule->rr_resource,
+ resourcenames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (actionstr == NULL || actionstr[0] == '\0')
+ rule->rr_action = RCTL_ACTION_UNDEFINED;
+ else {
+ error = str2value(actionstr, &rule->rr_action, actionnames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (amountstr == NULL || amountstr[0] == '\0')
+ rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ else {
+ error = str2int64(amountstr, &rule->rr_amount);
+ if (error != 0)
+ goto out;
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ rule->rr_amount *= 1000000;
+ }
+
+ if (perstr == NULL || perstr[0] == '\0')
+ rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
+ else {
+ error = str2value(perstr, &rule->rr_per, subjectnames);
+ if (error != 0)
+ goto out;
+ }
+
+out:
+ if (error == 0)
+ *rulep = rule;
+ else
+ rctl_rule_release(rule);
+
+ return (error);
+}
+
+/*
+ * Link a rule with all the subjects it applies to.
+ */
+int
+rctl_rule_add(struct rctl_rule *rule)
+{
+ struct proc *p;
+ struct ucred *cred;
+ struct uidinfo *uip;
+ struct prison *pr;
+ struct prison_racct *prr;
+ struct loginclass *lc;
+ struct rctl_rule *rule2;
+ int match;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+
+ /*
+ * Some rules just don't make sense. Note that the one below
+ * cannot be rewritten using RACCT_IS_DENIABLE(); RACCT_PCTCPU,
+ * for example, is not deniable in the racct sense, but its limit
+ * is enforced in a different way, so "deny" rules for %CPU
+ * do make sense.
+ */
+ if (rule->rr_action == RCTL_ACTION_DENY &&
+ (rule->rr_resource == RACCT_CPU ||
+ rule->rr_resource == RACCT_WALLCLOCK))
+ return (EOPNOTSUPP);
+
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
+ RACCT_IS_SLOPPY(rule->rr_resource))
+ return (EOPNOTSUPP);
+
+ /*
+ * Make sure there are no duplicated rules. Also, for the "deny"
+ * rules, remove ones differing only by "amount".
+ */
+ if (rule->rr_action == RCTL_ACTION_DENY) {
+ rule2 = rctl_rule_duplicate(rule, M_WAITOK);
+ rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ rctl_rule_remove(rule2);
+ rctl_rule_release(rule2);
+ } else
+ rctl_rule_remove(rule);
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ p = rule->rr_subject.rs_proc;
+ KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
+
+ rctl_racct_add_rule(p->p_racct, rule);
+ /*
+ * In case of per-process rule, we don't have anything more
+ * to do.
+ */
+ return (0);
+
+ case RCTL_SUBJECT_TYPE_USER:
+ uip = rule->rr_subject.rs_uip;
+ KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
+ rctl_racct_add_rule(uip->ui_racct, rule);
+ break;
+
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ lc = rule->rr_subject.rs_loginclass;
+ KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
+ rctl_racct_add_rule(lc->lc_racct, rule);
+ break;
+
+ case RCTL_SUBJECT_TYPE_JAIL:
+ prr = rule->rr_subject.rs_prison_racct;
+ KASSERT(prr != NULL, ("rctl_rule_add: NULL prison_racct"));
+ rctl_racct_add_rule(prr->prr_racct, rule);
+ break;
+
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
+ /*
+ * Now go through all the processes and add the new rule to the ones
+ * it applies to.
+ */
+ sx_assert(&allproc_lock, SA_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ cred = p->p_ucred;
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_USER:
+ if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
+ cred->cr_ruidinfo == rule->rr_subject.rs_uip)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ match = 0;
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
+ if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
+ match = 1;
+ break;
+ }
+ }
+ if (match)
+ break;
+ continue;
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
+ rctl_racct_add_rule(p->p_racct, rule);
+ }
+
+ return (0);
+}
+
+static void
+rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
+{
+ struct rctl_rule *filter = (struct rctl_rule *)arg2;
+ int found = 0;
+
+ rw_wlock(&rctl_lock);
+ found += rctl_racct_remove_rules(racct, filter);
+ rw_wunlock(&rctl_lock);
+
+ *((int *)arg3) += found;
+}
+
+/*
+ * Remove all rules that match the filter.
+ */
+int
+rctl_rule_remove(struct rctl_rule *filter)
+{
+ int found = 0;
+ struct proc *p;
+
+ if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
+ filter->rr_subject.rs_proc != NULL) {
+ p = filter->rr_subject.rs_proc;
+ rw_wlock(&rctl_lock);
+ found = rctl_racct_remove_rules(p->p_racct, filter);
+ rw_wunlock(&rctl_lock);
+ if (found)
+ return (0);
+ return (ESRCH);
+ }
+
+ loginclass_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+ ui_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+ prison_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+
+ sx_assert(&allproc_lock, SA_LOCKED);
+ rw_wlock(&rctl_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ found += rctl_racct_remove_rules(p->p_racct, filter);
+ }
+ rw_wunlock(&rctl_lock);
+
+ if (found)
+ return (0);
+ return (ESRCH);
+}
+
+/*
+ * Appends a rule to the sbuf.
+ */
+static void
+rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
+{
+ int64_t amount;
+
+ sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (rule->rr_subject.rs_proc == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%d:",
+ rule->rr_subject.rs_proc->p_pid);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%d:",
+ rule->rr_subject.rs_uip->ui_uid);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%s:",
+ rule->rr_subject.rs_loginclass->lc_name);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%s:",
+ rule->rr_subject.rs_prison_racct->prr_name);
+ break;
+ default:
+ panic("rctl_rule_to_sbuf: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
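+ /* Print per-million resources, such as %cpu, in user-visible units. */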
+ amount = rule->rr_amount;
+ if (amount != RCTL_AMOUNT_UNDEFINED &&
+ RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ amount /= 1000000;
+
+ sbuf_printf(sb, "%s:%s=%jd",
+ rctl_resource_name(rule->rr_resource),
+ rctl_action_name(rule->rr_action),
+ amount);
+
+ if (rule->rr_per != rule->rr_subject_type)
+ sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
+}
+
+/*
+ * Routine used by RCTL syscalls to read in the input string.
+ */
+static int
+rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
+{
+ int error;
+ char *str;
+
+ if (inbuflen <= 0)
+ return (EINVAL);
+ if (inbuflen > RCTL_MAX_INBUFLEN)
+ return (E2BIG);
+
+ str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
+ error = copyinstr(inbufp, str, inbuflen, NULL);
+ if (error != 0) {
+ free(str, M_RCTL);
+ return (error);
+ }
+
+ *inputstr = str;
+
+ return (0);
+}
+
+/*
+ * Routine used by RCTL syscalls to write out the output string.
+ */
+static int
+rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
+{
+ int error;
+
+ if (outputsbuf == NULL)
+ return (0);
+
+ sbuf_finish(outputsbuf);
+ if (outbuflen < sbuf_len(outputsbuf) + 1) {
+ sbuf_delete(outputsbuf);
+ return (ERANGE);
+ }
+ error = copyout(sbuf_data(outputsbuf), outbufp,
+ sbuf_len(outputsbuf) + 1);
+ sbuf_delete(outputsbuf);
+ return (error);
+}
+
+static struct sbuf *
+rctl_racct_to_sbuf(struct racct *racct, int sloppy)
+{
+ int i;
+ int64_t amount;
+ struct sbuf *sb;
+
+ sb = sbuf_new_auto();
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (sloppy == 0 && RACCT_IS_SLOPPY(i))
+ continue;
+ amount = racct->r_resources[i];
+ if (RACCT_IS_IN_MILLIONS(i))
+ amount /= 1000000;
+ sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
+ }
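+ /* Strip the trailing comma. */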
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+ return (sb);
+}
+
+int
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+{
+ int error;
+ char *inputstr;
+ struct rctl_rule *filter;
+ struct sbuf *outputsbuf = NULL;
+ struct proc *p;
+ struct uidinfo *uip;
+ struct loginclass *lc;
+ struct prison_racct *prr;
+
+ error = priv_check(td, PRIV_RCTL_GET_RACCT);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ switch (filter->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ p = filter->rr_subject.rs_proc;
+ if (p == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ uip = filter->rr_subject.rs_uip;
+ if (uip == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ lc = filter->rr_subject.rs_loginclass;
+ if (lc == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ prr = filter->rr_subject.rs_prison_racct;
+ if (prr == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
+ break;
+ default:
+ error = EINVAL;
+ }
+out:
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ if (error != 0)
+ return (error);
+
+ error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
+
+ return (error);
+}
+
+static void
+rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
+{
+ struct rctl_rule *filter = (struct rctl_rule *)arg2;
+ struct rctl_rule_link *link;
+ struct sbuf *sb = (struct sbuf *)arg3;
+
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+}
+
+int
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+{
+ int error;
+ size_t bufsize = RCTL_DEFAULT_BUFSIZE;
+ char *inputstr, *buf;
+ struct sbuf *sb;
+ struct rctl_rule *filter;
+ struct rctl_rule_link *link;
+ struct proc *p;
+
+ error = priv_check(td, PRIV_RCTL_GET_RULES);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
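+ /*
+ * If the rules do not fit into the fixed-size sbuf, quadruple
+ * the buffer and start over.
+ */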
+again:
+ buf = malloc(bufsize, M_RCTL, M_WAITOK);
+ sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
+ KASSERT(sb != NULL, ("sbuf_new failed"));
+
+ sx_assert(&allproc_lock, SA_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ /*
+ * Non-process rules will be added to the buffer later.
+ * Adding them here would result in duplicated output.
+ */
+ if (link->rrl_rule->rr_subject_type !=
+ RCTL_SUBJECT_TYPE_PROCESS)
+ continue;
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+ }
+
+ loginclass_racct_foreach(rctl_get_rules_callback, filter, sb);
+ ui_racct_foreach(rctl_get_rules_callback, filter, sb);
+ prison_racct_foreach(rctl_get_rules_callback, filter, sb);
+ if (sbuf_error(sb) == ENOMEM) {
+ sbuf_delete(sb);
+ free(buf, M_RCTL);
+ bufsize *= 4;
+ goto again;
+ }
+
+ /*
+ * Remove trailing ",".
+ */
+ if (sbuf_len(sb) > 0)
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+
+ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
+
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ free(buf, M_RCTL);
+ return (error);
+}
+
+int
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+{
+ int error;
+ size_t bufsize = RCTL_DEFAULT_BUFSIZE;
+ char *inputstr, *buf;
+ struct sbuf *sb;
+ struct rctl_rule *filter;
+ struct rctl_rule_link *link;
+
+ error = priv_check(td, PRIV_RCTL_GET_LIMITS);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EINVAL);
+ }
+ if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EOPNOTSUPP);
+ }
+ if (filter->rr_subject.rs_proc == NULL) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EINVAL);
+ }
+
+again:
+ buf = malloc(bufsize, M_RCTL, M_WAITOK);
+ sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
+ KASSERT(sb != NULL, ("sbuf_new failed"));
+
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
+ rrl_next) {
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+ if (sbuf_error(sb) == ENOMEM) {
+ sbuf_delete(sb);
+ free(buf, M_RCTL);
+ bufsize *= 4;
+ goto again;
+ }
+
+ /*
+ * Remove trailing ",".
+ */
+ if (sbuf_len(sb) > 0)
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+
+ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ free(buf, M_RCTL);
+ return (error);
+}
+
+int
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+{
+ int error;
+ struct rctl_rule *rule;
+ char *inputstr;
+
+ error = priv_check(td, PRIV_RCTL_ADD_RULE);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &rule);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+ /*
+ * The 'per' part of a rule is optional.
+ */
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
+ rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
+ rule->rr_per = rule->rr_subject_type;
+
+ if (!rctl_rule_fully_specified(rule)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = rctl_rule_add(rule);
+
+out:
+ rctl_rule_release(rule);
+ sx_sunlock(&allproc_lock);
+ return (error);
+}
+
+int
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+{
+ int error;
+ struct rctl_rule *filter;
+ char *inputstr;
+
+ error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ error = rctl_rule_remove(filter);
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+
+ return (error);
+}
+
+/*
+ * Update RCTL rule list after credential change.
+ */
+void
+rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
+{
+ int rulecnt, i;
+ struct rctl_rule_link *link, *newlink;
+ struct uidinfo *newuip;
+ struct loginclass *newlc;
+ struct prison_racct *newprr;
+ LIST_HEAD(, rctl_rule_link) newrules;
+
+ newuip = newcred->cr_ruidinfo;
+ newlc = newcred->cr_loginclass;
+ newprr = newcred->cr_prison->pr_prison_racct;
+
+ LIST_INIT(&newrules);
+
+again:
+ /*
+ * First, count the rules that apply to the process with new
+ * credentials.
+ */
+ rulecnt = 0;
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS)
+ rulecnt++;
+ }
+ LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ rw_runlock(&rctl_lock);
+
+ /*
+ * Create temporary list. We've dropped the rctl_lock in order
+ * to use M_WAITOK.
+ */
+ for (i = 0; i < rulecnt; i++) {
+ newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
+ newlink->rrl_rule = NULL;
+ LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
+ }
+
+ newlink = LIST_FIRST(&newrules);
+
+ /*
+ * Assign rules to the newly allocated list entries.
+ */
+ rw_wlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+ }
+
+ LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
+ LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
+ LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
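+ /*
+ * We found exactly as many rules as we counted earlier, so the
+ * lists have not changed under us; commit the new rule list.
+ */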
+ if (rulecnt == 0) {
+ /*
+ * Free the old rule list.
+ */
+ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
+ link = LIST_FIRST(&p->p_racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+
+ /*
+ * Replace lists and we're done.
+ *
+ * XXX: Is there any way to switch list heads instead
+ * of iterating here?
+ */
+ while (!LIST_EMPTY(&newrules)) {
+ newlink = LIST_FIRST(&newrules);
+ LIST_REMOVE(newlink, rrl_next);
+ LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
+ newlink, rrl_next);
+ }
+
+ rw_wunlock(&rctl_lock);
+
+ return;
+ }
+
+goaround:
+ rw_wunlock(&rctl_lock);
+
+ /*
+ * Rule list changed while we were not holding the rctl_lock.
+ * Free the new list and try again.
+ */
+ while (!LIST_EMPTY(&newrules)) {
+ newlink = LIST_FIRST(&newrules);
+ LIST_REMOVE(newlink, rrl_next);
+ if (newlink->rrl_rule != NULL)
+ rctl_rule_release(newlink->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, newlink);
+ }
+
+ goto again;
+}
+
+/*
+ * Assign RCTL rules to the newly created process.
+ */
+int
+rctl_proc_fork(struct proc *parent, struct proc *child)
+{
+ int error;
+ struct rctl_rule_link *link;
+ struct rctl_rule *rule;
+
+ LIST_INIT(&child->p_racct->r_rule_links);
+
+ KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
+
+ rw_wlock(&rctl_lock);
+
+ /*
+ * Go through limits applicable to the parent and assign them
+ * to the child. Rules with 'process' subject have to be duplicated
+ * in order to make their rr_subject point to the new process.
+ */
+ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS) {
+ rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
+ if (rule == NULL)
+ goto fail;
+ KASSERT(rule->rr_subject.rs_proc == parent,
+ ("rule->rr_subject.rs_proc != parent"));
+ rule->rr_subject.rs_proc = child;
+ error = rctl_racct_add_rule_locked(child->p_racct,
+ rule);
+ rctl_rule_release(rule);
+ if (error != 0)
+ goto fail;
+ } else {
+ error = rctl_racct_add_rule_locked(child->p_racct,
+ link->rrl_rule);
+ if (error != 0)
+ goto fail;
+ }
+ }
+
+ rw_wunlock(&rctl_lock);
+ return (0);
+
+fail:
+ while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
+ link = LIST_FIRST(&child->p_racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+ rw_wunlock(&rctl_lock);
+ return (EAGAIN);
+}
+
+/*
+ * Release rules attached to the racct.
+ */
+void
+rctl_racct_release(struct racct *racct)
+{
+ struct rctl_rule_link *link;
+
+ rw_wlock(&rctl_lock);
+ while (!LIST_EMPTY(&racct->r_rule_links)) {
+ link = LIST_FIRST(&racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+ rw_wunlock(&rctl_lock);
+}
+
+static void
+rctl_init(void)
+{
+
+ rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
+ sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+
+#else /* !RCTL */
+
+int
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* !RCTL */
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
new file mode 100644
index 0000000..57ee671
--- /dev/null
+++ b/sys/kern/kern_resource.c
@@ -0,0 +1,1434 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/time.h>
+#include <sys/umtx.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
+static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
+#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+static struct rwlock uihashtbl_lock;
+static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
+static u_long uihash; /* size of hash table - 1 */
+
+static void calcru1(struct proc *p, struct rusage_ext *ruxp,
+ struct timeval *up, struct timeval *sp);
+static int donice(struct thread *td, struct proc *chgp, int n);
+static struct uidinfo *uilookup(uid_t uid);
+static void ruxagg_locked(struct rusage_ext *rux, struct thread *td);
+
+/*
+ * Resource controls and accounting.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getpriority_args {
+ int which;
+ int who;
+};
+#endif
+int
+sys_getpriority(td, uap)
+ struct thread *td;
+ register struct getpriority_args *uap;
+{
+ struct proc *p;
+ struct pgrp *pg;
+ int error, low;
+
+ error = 0;
+ low = PRIO_MAX + 1;
+ switch (uap->which) {
+
+ case PRIO_PROCESS:
+ if (uap->who == 0)
+ low = td->td_proc->p_nice;
+ else {
+ p = pfind(uap->who);
+ if (p == NULL)
+ break;
+ if (p_cansee(td, p) == 0)
+ low = p->p_nice;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case PRIO_PGRP:
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = td->td_proc->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0) {
+ if (p->p_nice < low)
+ low = p->p_nice;
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0 &&
+ p->p_ucred->cr_uid == uap->who) {
+ if (p->p_nice < low)
+ low = p->p_nice;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (low == PRIO_MAX + 1 && error == 0)
+ error = ESRCH;
+ td->td_retval[0] = low;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setpriority_args {
+ int which;
+ int who;
+ int prio;
+};
+#endif
+int
+sys_setpriority(td, uap)
+ struct thread *td;
+ struct setpriority_args *uap;
+{
+ struct proc *curp, *p;
+ struct pgrp *pg;
+ int found = 0, error = 0;
+
+ curp = td->td_proc;
+ switch (uap->which) {
+ case PRIO_PROCESS:
+ if (uap->who == 0) {
+ PROC_LOCK(curp);
+ error = donice(td, curp, uap->prio);
+ PROC_UNLOCK(curp);
+ } else {
+ p = pfind(uap->who);
+ if (p == NULL)
+ break;
+ error = p_cansee(td, p);
+ if (error == 0)
+ error = donice(td, p, uap->prio);
+ PROC_UNLOCK(p);
+ }
+ found++;
+ break;
+
+ case PRIO_PGRP:
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = curp->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p->p_ucred->cr_uid == uap->who &&
+ p_cansee(td, p) == 0) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (found == 0 && error == 0)
+ error = ESRCH;
+ return (error);
+}
+
+/*
+ * Set "nice" for a (whole) process.
+ */
+static int
+donice(struct thread *td, struct proc *p, int n)
+{
+ int error;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((error = p_cansched(td, p)))
+ return (error);
+ if (n > PRIO_MAX)
+ n = PRIO_MAX;
+ if (n < PRIO_MIN)
+ n = PRIO_MIN;
+ if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
+ return (EACCES);
+ sched_nice(p, n);
+ return (0);
+}
+
+static int unprivileged_idprio;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW,
+ &unprivileged_idprio, 0, "Allow non-root users to set an idle priority");
+
+/*
+ * Set realtime priority for LWP.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_thread_args {
+ int function;
+ lwpid_t lwpid;
+ struct rtprio *rtp;
+};
+#endif
+int
+sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+{
+ struct proc *p;
+ struct rtprio rtp;
+ struct thread *td1;
+ int cierror, error;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ else
+ cierror = 0;
+
+ if (uap->lwpid == 0 || uap->lwpid == td->td_tid) {
+ p = td->td_proc;
+ td1 = td;
+ PROC_LOCK(p);
+ } else {
+ /* Only look up thread in current process */
+ td1 = tdfind(uap->lwpid, curproc->p_pid);
+ if (td1 == NULL)
+ return (ESRCH);
+ p = td1->td_proc;
+ }
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ pri_to_rtp(td1, &rtp);
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+
+ /* Disallow setting rtprio in most cases if not superuser. */
+
+ /*
+ * Realtime priority has to be restricted for reasons which
+ * should be obvious. However, for idleprio processes, there is
+ * a potential for system deadlock if an idleprio process gains
+ * a lock on a resource that other processes need (and the
+ * idleprio process can't run due to a CPU-bound normal
+ * process). Fix me! XXX
+ *
+ * This problem is not only related to idleprio processes.
+ * A user level program can obtain a file lock and hold it
+ * indefinitely. Additionally, without idleprio processes it is
+ * still conceivable that a program with low priority will never
+ * get to run. In short, allowing this feature might make it
+ * easier to lock a resource indefinitely, but it is not the
+ * only thing that makes it possible.
+ */
+ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME ||
+ (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE &&
+ unprivileged_idprio == 0)) {
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
+ break;
+ }
+ error = rtp_to_pri(&rtp, td1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Set realtime priority.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_args {
+ int function;
+ pid_t pid;
+ struct rtprio *rtp;
+};
+#endif
+int
+sys_rtprio(td, uap)
+ struct thread *td; /* curthread */
+ register struct rtprio_args *uap;
+{
+ struct proc *p;
+ struct thread *tdp;
+ struct rtprio rtp;
+ int cierror, error;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ else
+ cierror = 0;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ }
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ /*
+ * Return OUR priority if no pid specified,
+ * or if one is, report the highest priority
+ * in the process. There isn't much more you can do as
+ * there is only room to return a single priority.
+ * Note: specifying our own pid is not the same
+ * as leaving it zero.
+ */
+ if (uap->pid == 0) {
+ pri_to_rtp(td, &rtp);
+ } else {
+ struct rtprio rtp2;
+
+ rtp.type = RTP_PRIO_IDLE;
+ rtp.prio = RTP_PRIO_MAX;
+ FOREACH_THREAD_IN_PROC(p, tdp) {
+ pri_to_rtp(tdp, &rtp2);
+ if (rtp2.type < rtp.type ||
+ (rtp2.type == rtp.type &&
+ rtp2.prio < rtp.prio)) {
+ rtp.type = rtp2.type;
+ rtp.prio = rtp2.prio;
+ }
+ }
+ }
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+
+ /*
+ * Disallow setting rtprio in most cases if not superuser.
+ * See the comment in sys_rtprio_thread about idprio
+ * threads holding a lock.
+ */
+ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME ||
+ (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE &&
+ !unprivileged_idprio)) {
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
+ break;
+ }
+
+ /*
+ * If we are setting our own priority, set just our
+ * thread but if we are doing another process,
+ * do all the threads on that process. If we
+ * specify our own pid we do the latter.
+ */
+ if (uap->pid == 0) {
+ error = rtp_to_pri(&rtp, td);
+ } else {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((error = rtp_to_pri(&rtp, td)) != 0)
+ break;
+ }
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+int
+rtp_to_pri(struct rtprio *rtp, struct thread *td)
+{
+ u_char newpri, oldclass, oldpri;
+
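+ /* Translate the class-relative rtprio value into an absolute kernel priority. */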
+ switch (RTP_PRIO_BASE(rtp->type)) {
+ case RTP_PRIO_REALTIME:
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ newpri = PRI_MIN_REALTIME + rtp->prio;
+ break;
+ case RTP_PRIO_NORMAL:
+ if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE))
+ return (EINVAL);
+ newpri = PRI_MIN_TIMESHARE + rtp->prio;
+ break;
+ case RTP_PRIO_IDLE:
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ newpri = PRI_MIN_IDLE + rtp->prio;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ thread_lock(td);
+ oldclass = td->td_pri_class;
+ sched_class(td, rtp->type); /* XXX fix */
+ oldpri = td->td_user_pri;
+ sched_user_prio(td, newpri);
+ if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL ||
+ td->td_pri_class != RTP_PRIO_NORMAL))
+ sched_prio(td, td->td_user_pri);
+ if (TD_ON_UPILOCK(td) && oldpri != newpri) {
+ critical_enter();
+ thread_unlock(td);
+ umtx_pi_adjust(td, oldpri);
+ critical_exit();
+ } else
+ thread_unlock(td);
+ return (0);
+}
+
+void
+pri_to_rtp(struct thread *td, struct rtprio *rtp)
+{
+
+ thread_lock(td);
+ switch (PRI_BASE(td->td_pri_class)) {
+ case PRI_REALTIME:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
+ break;
+ case PRI_TIMESHARE:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
+ break;
+ case PRI_IDLE:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
+ break;
+ default:
+ break;
+ }
+ rtp->type = td->td_pri_class;
+ thread_unlock(td);
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct osetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+int
+osetrlimit(td, uap)
+ struct thread *td;
+ register struct osetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit lim;
+ int error;
+
+ if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
+ return (error);
+ lim.rlim_cur = olim.rlim_cur;
+ lim.rlim_max = olim.rlim_max;
+ error = kern_setrlimit(td, uap->which, &lim);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+int
+ogetrlimit(td, uap)
+ struct thread *td;
+ register struct ogetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit rl;
+ struct proc *p;
+ int error;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lim_rlimit(p, uap->which, &rl);
+ PROC_UNLOCK(p);
+
+ /*
+	 * XXX It would be more correct to convert only RLIM_INFINITY to
+	 * the old RLIM_INFINITY and fail with EOVERFLOW for other larger
+	 * values. Most 64->32 and 32->16 conversions, including the not
+	 * unimportant ones of uids, are even more broken than what we
+ * do here (they blindly truncate). We don't do this correctly
+ * here since we have little experience with EOVERFLOW yet.
+ * Elsewhere, getuid() can't fail...
+ */
+ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
+ olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
+ error = copyout(&olim, uap->rlp, sizeof(olim));
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct __setrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+int
+sys_setrlimit(td, uap)
+ struct thread *td;
+ register struct __setrlimit_args *uap;
+{
+ struct rlimit alim;
+ int error;
+
+ if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
+ return (error);
+ error = kern_setrlimit(td, uap->which, &alim);
+ return (error);
+}
+
+static void
+lim_cb(void *arg)
+{
+ struct rlimit rlim;
+ struct thread *td;
+ struct proc *p;
+
+ p = arg;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * Check if the process exceeds its cpu resource allocation. If
+ * it reaches the max, arrange to kill the process in ast().
+ */
+ if (p->p_cpulimit == RLIM_INFINITY)
+ return;
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ ruxagg(p, td);
+ }
+ PROC_SUNLOCK(p);
+ if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
+ lim_rlimit(p, RLIMIT_CPU, &rlim);
+ if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
+ killproc(p, "exceeded maximum CPU limit");
+ } else {
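+			/*
+			 * Soft limit exceeded: push the checkpoint out by
+			 * five seconds and notify the process with SIGXCPU.
+			 */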
+ if (p->p_cpulimit < rlim.rlim_max)
+ p->p_cpulimit += 5;
+ kern_psignal(p, SIGXCPU);
+ }
+ }
+ if ((p->p_flag & P_WEXIT) == 0)
+ callout_reset_sbt(&p->p_limco, SBT_1S, 0,
+ lim_cb, p, C_PREL(1));
+}
+
+int
+kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp)
+{
+
+ return (kern_proc_setrlimit(td, td->td_proc, which, limp));
+}
+
+int
+kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which,
+ struct rlimit *limp)
+{
+ struct plimit *newlim, *oldlim;
+ register struct rlimit *alimp;
+ struct rlimit oldssiz;
+ int error;
+
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+
+ /*
+ * Preserve historical bugs by treating negative limits as unsigned.
+ */
+ if (limp->rlim_cur < 0)
+ limp->rlim_cur = RLIM_INFINITY;
+ if (limp->rlim_max < 0)
+ limp->rlim_max = RLIM_INFINITY;
+
+ oldssiz.rlim_cur = 0;
+ newlim = lim_alloc();
+ PROC_LOCK(p);
+ oldlim = p->p_limit;
+ alimp = &oldlim->pl_rlimit[which];
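+	/*
+	 * Raising either the soft or the hard limit above the current
+	 * hard limit requires privilege.
+	 */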
+ if (limp->rlim_cur > alimp->rlim_max ||
+ limp->rlim_max > alimp->rlim_max)
+ if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
+ PROC_UNLOCK(p);
+ lim_free(newlim);
+ return (error);
+ }
+ if (limp->rlim_cur > limp->rlim_max)
+ limp->rlim_cur = limp->rlim_max;
+ lim_copy(newlim, oldlim);
+ alimp = &newlim->pl_rlimit[which];
+
+ switch (which) {
+
+ case RLIMIT_CPU:
+ if (limp->rlim_cur != RLIM_INFINITY &&
+ p->p_cpulimit == RLIM_INFINITY)
+ callout_reset_sbt(&p->p_limco, SBT_1S, 0,
+ lim_cb, p, C_PREL(1));
+ p->p_cpulimit = limp->rlim_cur;
+ break;
+ case RLIMIT_DATA:
+ if (limp->rlim_cur > maxdsiz)
+ limp->rlim_cur = maxdsiz;
+ if (limp->rlim_max > maxdsiz)
+ limp->rlim_max = maxdsiz;
+ break;
+
+ case RLIMIT_STACK:
+ if (limp->rlim_cur > maxssiz)
+ limp->rlim_cur = maxssiz;
+ if (limp->rlim_max > maxssiz)
+ limp->rlim_max = maxssiz;
+ oldssiz = *alimp;
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(&oldssiz,
+ RLIMIT_STACK);
+ break;
+
+ case RLIMIT_NOFILE:
+ if (limp->rlim_cur > maxfilesperproc)
+ limp->rlim_cur = maxfilesperproc;
+ if (limp->rlim_max > maxfilesperproc)
+ limp->rlim_max = maxfilesperproc;
+ break;
+
+ case RLIMIT_NPROC:
+ if (limp->rlim_cur > maxprocperuid)
+ limp->rlim_cur = maxprocperuid;
+ if (limp->rlim_max > maxprocperuid)
+ limp->rlim_max = maxprocperuid;
+ if (limp->rlim_cur < 1)
+ limp->rlim_cur = 1;
+ if (limp->rlim_max < 1)
+ limp->rlim_max = 1;
+ break;
+ }
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(limp, which);
+ *alimp = *limp;
+ p->p_limit = newlim;
+ PROC_UNLOCK(p);
+ lim_free(oldlim);
+
+ if (which == RLIMIT_STACK) {
+ /*
+		 * The stack is allocated to the max at exec time, with only
+		 * "rlim_cur" bytes accessible. If the limit is going up,
+		 * make more of it accessible; if it is going down, make the
+		 * excess inaccessible.
+ */
+ if (limp->rlim_cur != oldssiz.rlim_cur) {
+ vm_offset_t addr;
+ vm_size_t size;
+ vm_prot_t prot;
+
+ if (limp->rlim_cur > oldssiz.rlim_cur) {
+ prot = p->p_sysent->sv_stackprot;
+ size = limp->rlim_cur - oldssiz.rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ limp->rlim_cur;
+ } else {
+ prot = VM_PROT_NONE;
+ size = oldssiz.rlim_cur - limp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ oldssiz.rlim_cur;
+ }
+ addr = trunc_page(addr);
+ size = round_page(size);
+ (void)vm_map_protect(&p->p_vmspace->vm_map,
+ addr, addr + size, prot, FALSE);
+ }
+ }
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getrlimit(td, uap)
+ struct thread *td;
+ register struct __getrlimit_args *uap;
+{
+ struct rlimit rlim;
+ struct proc *p;
+ int error;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lim_rlimit(p, uap->which, &rlim);
+ PROC_UNLOCK(p);
+ error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
+ return (error);
+}
+
+/*
+ * Transform the running time and tick information for children of proc p
+ * into user and system time usage.
+ */
+void
+calccru(p, up, sp)
+ struct proc *p;
+ struct timeval *up;
+ struct timeval *sp;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ calcru1(p, &p->p_crux, up, sp);
+}
+
+/*
+ * Transform the running time and tick information in proc p into user
+ * and system time usage. If appropriate, include the current time slice
+ * on this CPU.
+ */
+void
+calcru(struct proc *p, struct timeval *up, struct timeval *sp)
+{
+ struct thread *td;
+ uint64_t runtime, u;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * If we are getting stats for the current process, then add in the
+ * stats that this thread has accumulated in its current time slice.
+ * We reset the thread and CPU state as if we had performed a context
+ * switch right here.
+ */
+ td = curthread;
+ if (td->td_proc == p) {
+ u = cpu_ticks();
+ runtime = u - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, u);
+ }
+ /* Make sure the per-thread stats are current. */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_incruntime == 0)
+ continue;
+ ruxagg(p, td);
+ }
+ calcru1(p, &p->p_rux, up, sp);
+}
+
+/* Collect resource usage for a single thread. */
+void
+rufetchtd(struct thread *td, struct rusage *ru)
+{
+ struct proc *p;
+ uint64_t runtime, u;
+
+ p = td->td_proc;
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * If we are getting stats for the current thread, then add in the
+ * stats that this thread has accumulated in its current time slice.
+ * We reset the thread and CPU state as if we had performed a context
+ * switch right here.
+ */
+ if (td == curthread) {
+ u = cpu_ticks();
+ runtime = u - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, u);
+ }
+ ruxagg(p, td);
+ *ru = td->td_ru;
+ calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
+}
+
+static void
+calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
+ struct timeval *sp)
+{
+ /* {user, system, interrupt, total} {ticks, usec}: */
+ uint64_t ut, uu, st, su, it, tt, tu;
+
+ ut = ruxp->rux_uticks;
+ st = ruxp->rux_sticks;
+ it = ruxp->rux_iticks;
+ tt = ut + st + it;
+ if (tt == 0) {
+ /* Avoid divide by zero */
+ st = 1;
+ tt = 1;
+ }
+ tu = cputick2usec(ruxp->rux_runtime);
+ if ((int64_t)tu < 0) {
+ /* XXX: this should be an assert /phk */
+ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
+ (intmax_t)tu, p->p_pid, p->p_comm);
+ tu = ruxp->rux_tu;
+ }
+
+ if (tu >= ruxp->rux_tu) {
+ /*
+ * The normal case, time increased.
+ * Enforce monotonicity of bucketed numbers.
+ */
+ uu = (tu * ut) / tt;
+ if (uu < ruxp->rux_uu)
+ uu = ruxp->rux_uu;
+ su = (tu * st) / tt;
+ if (su < ruxp->rux_su)
+ su = ruxp->rux_su;
+ } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
+ /*
+ * When we calibrate the cputicker, it is not uncommon to
+ * see the presumably fixed frequency increase slightly over
+ * time as a result of thermal stabilization and NTP
+ * discipline (of the reference clock). We therefore ignore
+ * a bit of backwards slop because we expect to catch up
+ * shortly. We use a 3 microsecond limit to catch low
+ * counts and a 1% limit for high counts.
+ */
+ uu = ruxp->rux_uu;
+ su = ruxp->rux_su;
+ tu = ruxp->rux_tu;
+ } else { /* tu < ruxp->rux_tu */
+ /*
+ * What happened here was likely that a laptop, which ran at
+ * a reduced clock frequency at boot, kicked into high gear.
+ * The wisdom of spamming this message in that case is
+ * dubious, but it might also be indicative of something
+		 * serious, so let's keep it and hope laptops can be made
+ * more truthful about their CPU speed via ACPI.
+ */
+ printf("calcru: runtime went backwards from %ju usec "
+ "to %ju usec for pid %d (%s)\n",
+ (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
+ p->p_pid, p->p_comm);
+ uu = (tu * ut) / tt;
+ su = (tu * st) / tt;
+ }
+
+ ruxp->rux_uu = uu;
+ ruxp->rux_su = su;
+ ruxp->rux_tu = tu;
+
+ up->tv_sec = uu / 1000000;
+ up->tv_usec = uu % 1000000;
+ sp->tv_sec = su / 1000000;
+ sp->tv_usec = su % 1000000;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getrusage_args {
+ int who;
+ struct rusage *rusage;
+};
+#endif
+int
+sys_getrusage(td, uap)
+ register struct thread *td;
+ register struct getrusage_args *uap;
+{
+ struct rusage ru;
+ int error;
+
+ error = kern_getrusage(td, uap->who, &ru);
+ if (error == 0)
+ error = copyout(&ru, uap->rusage, sizeof(struct rusage));
+ return (error);
+}
+
+int
+kern_getrusage(struct thread *td, int who, struct rusage *rup)
+{
+ struct proc *p;
+ int error;
+
+ error = 0;
+ p = td->td_proc;
+ PROC_LOCK(p);
+ switch (who) {
+ case RUSAGE_SELF:
+ rufetchcalc(p, rup, &rup->ru_utime,
+ &rup->ru_stime);
+ break;
+
+ case RUSAGE_CHILDREN:
+ *rup = p->p_stats->p_cru;
+ calccru(p, &rup->ru_utime, &rup->ru_stime);
+ break;
+
+ case RUSAGE_THREAD:
+ PROC_SLOCK(p);
+ thread_lock(td);
+ rufetchtd(td, rup);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+void
+rucollect(struct rusage *ru, struct rusage *ru2)
+{
+ long *ip, *ip2;
+ int i;
+
+ if (ru->ru_maxrss < ru2->ru_maxrss)
+ ru->ru_maxrss = ru2->ru_maxrss;
+ ip = &ru->ru_first;
+ ip2 = &ru2->ru_first;
+ for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
+ *ip++ += *ip2++;
+}
+
+void
+ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
+ struct rusage_ext *rux2)
+{
+
+ rux->rux_runtime += rux2->rux_runtime;
+ rux->rux_uticks += rux2->rux_uticks;
+ rux->rux_sticks += rux2->rux_sticks;
+ rux->rux_iticks += rux2->rux_iticks;
+ rux->rux_uu += rux2->rux_uu;
+ rux->rux_su += rux2->rux_su;
+ rux->rux_tu += rux2->rux_tu;
+ rucollect(ru, ru2);
+}
+
+/*
+ * Aggregate tick counts into the proc's rusage_ext.
+ */
+static void
+ruxagg_locked(struct rusage_ext *rux, struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+ rux->rux_runtime += td->td_incruntime;
+ rux->rux_uticks += td->td_uticks;
+ rux->rux_sticks += td->td_sticks;
+ rux->rux_iticks += td->td_iticks;
+}
+
+void
+ruxagg(struct proc *p, struct thread *td)
+{
+
+ thread_lock(td);
+ ruxagg_locked(&p->p_rux, td);
+ ruxagg_locked(&td->td_rux, td);
+ td->td_incruntime = 0;
+ td->td_uticks = 0;
+ td->td_iticks = 0;
+ td->td_sticks = 0;
+ thread_unlock(td);
+}
+
+/*
+ * Update the rusage_ext structure and fetch a valid aggregate rusage
+ * for proc p.
+ */
+void
+rufetch(struct proc *p, struct rusage *ru)
+{
+ struct thread *td;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ *ru = p->p_ru;
+ if (p->p_numthreads > 0) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ ruxagg(p, td);
+ rucollect(ru, &td->td_ru);
+ }
+ }
+}
+
+/*
+ * Atomically perform a rufetch and a calcru together.
+ * Consumers can safely assume that calcru is executed only once
+ * rufetch has completed.
+ */
+void
+rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
+ struct timeval *sp)
+{
+
+ PROC_SLOCK(p);
+ rufetch(p, ru);
+ calcru(p, up, sp);
+ PROC_SUNLOCK(p);
+}
+
+/*
+ * Allocate a new resource limits structure and initialize its
+ * reference count and mutex pointer.
+ */
+struct plimit *
+lim_alloc()
+{
+ struct plimit *limp;
+
+ limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
+ refcount_init(&limp->pl_refcnt, 1);
+ return (limp);
+}
+
+struct plimit *
+lim_hold(limp)
+ struct plimit *limp;
+{
+
+ refcount_acquire(&limp->pl_refcnt);
+ return (limp);
+}
+
+void
+lim_fork(struct proc *p1, struct proc *p2)
+{
+
+ PROC_LOCK_ASSERT(p1, MA_OWNED);
+ PROC_LOCK_ASSERT(p2, MA_OWNED);
+
+ p2->p_limit = lim_hold(p1->p_limit);
+ callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
+ if (p1->p_cpulimit != RLIM_INFINITY)
+ callout_reset_sbt(&p2->p_limco, SBT_1S, 0,
+ lim_cb, p2, C_PREL(1));
+}
+
+void
+lim_free(limp)
+ struct plimit *limp;
+{
+
+ KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
+ if (refcount_release(&limp->pl_refcnt))
+ free((void *)limp, M_PLIMIT);
+}
+
+/*
+ * Make a copy of the plimit structure.
+ * We share these structures copy-on-write after fork.
+ */
+void
+lim_copy(dst, src)
+ struct plimit *dst, *src;
+{
+
+ KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
+ bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
+}
+
+/*
+ * Return the hard limit for a particular system resource. The
+ * which parameter specifies the index into the rlimit array.
+ */
+rlim_t
+lim_max(struct proc *p, int which)
+{
+ struct rlimit rl;
+
+ lim_rlimit(p, which, &rl);
+ return (rl.rlim_max);
+}
+
+/*
+ * Return the current (soft) limit for a particular system resource.
+ * The which parameter specifies the index into the rlimit array.
+ */
+rlim_t
+lim_cur(struct proc *p, int which)
+{
+ struct rlimit rl;
+
+ lim_rlimit(p, which, &rl);
+ return (rl.rlim_cur);
+}
+
+/*
+ * Return a copy of the entire rlimit structure for the system limit
+ * specified by 'which' in the rlimit structure pointed to by 'rlp'.
+ */
+void
+lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(which >= 0 && which < RLIM_NLIMITS,
+ ("request for invalid resource limit"));
+ *rlp = p->p_limit->pl_rlimit[which];
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(rlp, which);
+}
+
+void
+uihashinit()
+{
+
+ uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
+ rw_init(&uihashtbl_lock, "uidinfo hash");
+}
+
+/*
+ * Look up a uidinfo struct for the parameter uid.
+ * uihashtbl_lock must be locked.
+ */
+static struct uidinfo *
+uilookup(uid)
+ uid_t uid;
+{
+ struct uihashhead *uipp;
+ struct uidinfo *uip;
+
+ rw_assert(&uihashtbl_lock, RA_LOCKED);
+ uipp = UIHASH(uid);
+ LIST_FOREACH(uip, uipp, ui_hash)
+ if (uip->ui_uid == uid)
+ break;
+
+ return (uip);
+}
+
+/*
+ * Find or allocate a struct uidinfo for a particular uid.
+ * Increase refcount on uidinfo struct returned.
+ * uifree() should be called on a struct uidinfo when released.
+ */
+struct uidinfo *
+uifind(uid)
+ uid_t uid;
+{
+ struct uidinfo *old_uip, *uip;
+
+ rw_rlock(&uihashtbl_lock);
+ uip = uilookup(uid);
+ if (uip == NULL) {
+ rw_runlock(&uihashtbl_lock);
+ uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
+ racct_create(&uip->ui_racct);
+ rw_wlock(&uihashtbl_lock);
+ /*
+ * There's a chance someone created our uidinfo while we
+ * were in malloc and not holding the lock, so we have to
+ * make sure we don't insert a duplicate uidinfo.
+ */
+ if ((old_uip = uilookup(uid)) != NULL) {
+ /* Someone else beat us to it. */
+ racct_destroy(&uip->ui_racct);
+ free(uip, M_UIDINFO);
+ uip = old_uip;
+ } else {
+ refcount_init(&uip->ui_ref, 0);
+ uip->ui_uid = uid;
+ mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL,
+ MTX_DEF);
+ LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
+ }
+ }
+ uihold(uip);
+ rw_unlock(&uihashtbl_lock);
+ return (uip);
+}
+
+/*
+ * Place another refcount on a uidinfo struct.
+ */
+void
+uihold(uip)
+ struct uidinfo *uip;
+{
+
+ refcount_acquire(&uip->ui_ref);
+}
+
+/*-
+ * Since uidinfo structs have a long lifetime, we use an
+ * opportunistic refcounting scheme to avoid locking the lookup hash
+ * for each release.
+ *
+ * If the refcount hits 0, we need to free the structure,
+ * which means we need to lock the hash.
+ * Optimal case:
+ *	The count is greater than 1 and an atomic decrement succeeds,
+ *	so the lookup hash is never locked.
+ * Suboptimal case:
+ *	The count may drop to zero (or the atomic update raced), so we
+ *	take the hash lock, release the reference under it and free the
+ *	structure if ours was the last reference.
+ */
+void
+uifree(uip)
+ struct uidinfo *uip;
+{
+ int old;
+
+ /* Prepare for optimal case. */
+ old = uip->ui_ref;
+ if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1))
+ return;
+
+ /* Prepare for suboptimal case. */
+ rw_wlock(&uihashtbl_lock);
+ if (refcount_release(&uip->ui_ref)) {
+ racct_destroy(&uip->ui_racct);
+ LIST_REMOVE(uip, ui_hash);
+ rw_wunlock(&uihashtbl_lock);
+ if (uip->ui_sbsize != 0)
+ printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
+ uip->ui_uid, uip->ui_sbsize);
+ if (uip->ui_proccnt != 0)
+ printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
+ uip->ui_uid, uip->ui_proccnt);
+ if (uip->ui_vmsize != 0)
+ printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
+ uip->ui_uid, (unsigned long long)uip->ui_vmsize);
+ mtx_destroy(&uip->ui_vmsize_mtx);
+ free(uip, M_UIDINFO);
+ return;
+ }
+ /*
+ * Someone added a reference between atomic_cmpset_int() and
+ * rw_wlock(&uihashtbl_lock).
+ */
+ rw_wunlock(&uihashtbl_lock);
+}
+
+void
+ui_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct uidinfo *uip;
+ struct uihashhead *uih;
+
+ rw_rlock(&uihashtbl_lock);
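+	/*
+	 * Walk every hash chain, from the last bucket down to the first,
+	 * and hand each uidinfo's racct to the callback.
+	 */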
+ for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
+ LIST_FOREACH(uip, uih, ui_hash) {
+ (callback)(uip->ui_racct, arg2, arg3);
+ }
+ }
+ rw_runlock(&uihashtbl_lock);
+}
+
+/*
+ * Change the count associated with the number of processes
+ * a given user is using. When 'max' is 0, don't enforce a limit.
+ */
+int
+chgproccnt(uip, diff, max)
+ struct uidinfo *uip;
+ int diff;
+ rlim_t max;
+{
+
+ /* Don't allow them to exceed max, but allow subtraction. */
+ if (diff > 0 && max != 0) {
+ if (atomic_fetchadd_long(&uip->ui_proccnt, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_proccnt, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_proccnt, (long)diff);
+ if (uip->ui_proccnt < 0)
+ printf("negative proccnt for uid = %d\n", uip->ui_uid);
+ }
+ return (1);
+}
+
+/*
+ * Change the total socket buffer size a user has used.
+ */
+int
+chgsbsize(uip, hiwat, to, max)
+ struct uidinfo *uip;
+ u_int *hiwat;
+ u_int to;
+ rlim_t max;
+{
+ int diff;
+
+ diff = to - *hiwat;
+ if (diff > 0) {
+ if (atomic_fetchadd_long(&uip->ui_sbsize, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_sbsize, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_sbsize, (long)diff);
+ if (uip->ui_sbsize < 0)
+ printf("negative sbsize for uid = %d\n", uip->ui_uid);
+ }
+ *hiwat = to;
+ return (1);
+}
+
+/*
+ * Change the count associated with the number of pseudo-terminals
+ * a given user is using. When 'max' is 0, don't enforce a limit.
+ */
+int
+chgptscnt(uip, diff, max)
+ struct uidinfo *uip;
+ int diff;
+ rlim_t max;
+{
+
+ /* Don't allow them to exceed max, but allow subtraction. */
+ if (diff > 0 && max != 0) {
+ if (atomic_fetchadd_long(&uip->ui_ptscnt, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_ptscnt, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_ptscnt, (long)diff);
+ if (uip->ui_ptscnt < 0)
+ printf("negative ptscnt for uid = %d\n", uip->ui_uid);
+ }
+ return (1);
+}
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
new file mode 100644
index 0000000..ff397eb
--- /dev/null
+++ b/sys/kern/kern_rmlock.c
@@ -0,0 +1,831 @@
+/*-
+ * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/kernel.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rmlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/turnstile.h>
+#include <sys/lock_profile.h>
+#include <machine/cpu.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * A cookie to mark destroyed rmlocks. This is stored in the head of
+ * rm_activeReaders.
+ */
+#define RM_DESTROYED ((void *)0xdead)
+
+#define rm_destroyed(rm) \
+ (LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)
+
+#define RMPF_ONQUEUE 1
+#define RMPF_SIGNAL 2
+
+#ifndef INVARIANTS
+#define _rm_assert(c, what, file, line)
+#endif
+
+static void assert_rm(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_rm(const struct lock_object *lock);
+#endif
+static void lock_rm(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_rm(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_rm(struct lock_object *lock);
+
+struct lock_class lock_class_rm = {
+ .lc_name = "rm",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
+ .lc_assert = assert_rm,
+#ifdef DDB
+ .lc_ddb_show = db_show_rm,
+#endif
+ .lc_lock = lock_rm,
+ .lc_unlock = unlock_rm,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rm,
+#endif
+};
+
+struct lock_class lock_class_rm_sleepable = {
+ .lc_name = "sleepable rm",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
+ .lc_assert = assert_rm,
+#ifdef DDB
+ .lc_ddb_show = db_show_rm,
+#endif
+ .lc_lock = lock_rm,
+ .lc_unlock = unlock_rm,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rm,
+#endif
+};
+
+static void
+assert_rm(const struct lock_object *lock, int what)
+{
+
+ rm_assert((const struct rmlock *)lock, what);
+}
+
+/*
+ * These do not support read locks because it would be hard to make
+ * the tracker work correctly with the current lock_class API, as you
+ * would need to have the tracker pointer available when calling
+ * rm_rlock() in lock_rm().
+ */
+static void
+lock_rm(struct lock_object *lock, int how)
+{
+ struct rmlock *rm;
+
+ rm = (struct rmlock *)lock;
+ if (how)
+ rm_wlock(rm);
+#ifdef INVARIANTS
+ else
+ panic("lock_rm called in read mode");
+#endif
+}
+
+static int
+unlock_rm(struct lock_object *lock)
+{
+ struct rmlock *rm;
+
+ rm = (struct rmlock *)lock;
+ rm_wunlock(rm);
+ return (1);
+}
+
+#ifdef KDTRACE_HOOKS
+static int
+owner_rm(const struct lock_object *lock, struct thread **owner)
+{
+ const struct rmlock *rm;
+ struct lock_class *lc;
+
+ rm = (const struct rmlock *)lock;
+ lc = LOCK_CLASS(&rm->rm_wlock_object);
+ return (lc->lc_owner(&rm->rm_wlock_object, owner));
+}
+#endif
+
+static struct mtx rm_spinlock;
+
+MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);
+
+/*
+ * Add or remove tracker from per-cpu list.
+ *
+ * The per-cpu list can be traversed at any time in forward direction from an
+ * interrupt on the *local* cpu.
+ */
+static void inline
+rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
+{
+ struct rm_queue *next;
+
+ /* Initialize all tracker pointers */
+ tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
+ next = pc->pc_rm_queue.rmq_next;
+ tracker->rmp_cpuQueue.rmq_next = next;
+
+	/* rmq_prev is not used during forward traversal. */
+ next->rmq_prev = &tracker->rmp_cpuQueue;
+
+ /* Update pointer to first element. */
+ pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
+}
+
+/*
+ * Return a count of the number of trackers the thread 'td' already
+ * has on this CPU for the lock 'rm'.
+ */
+static int
+rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
+ const struct thread *td)
+{
+ struct rm_queue *queue;
+ struct rm_priotracker *tracker;
+ int count;
+
+ count = 0;
+ for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
+ queue = queue->rmq_next) {
+ tracker = (struct rm_priotracker *)queue;
+ if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
+ count++;
+ }
+ return (count);
+}
+
+static void inline
+rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
+{
+ struct rm_queue *next, *prev;
+
+ next = tracker->rmp_cpuQueue.rmq_next;
+ prev = tracker->rmp_cpuQueue.rmq_prev;
+
+ /* Not used during forward traversal. */
+ next->rmq_prev = prev;
+
+ /* Remove from list. */
+ prev->rmq_next = next;
+}
+
+static void
+rm_cleanIPI(void *arg)
+{
+ struct pcpu *pc;
+ struct rmlock *rm = arg;
+ struct rm_priotracker *tracker;
+ struct rm_queue *queue;
+
+	pc = pcpu_find(curcpu);
+
+ for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
+ queue = queue->rmq_next) {
+ tracker = (struct rm_priotracker *)queue;
+ if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
+ tracker->rmp_flags = RMPF_ONQUEUE;
+ mtx_lock_spin(&rm_spinlock);
+ LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
+ rmp_qentry);
+ mtx_unlock_spin(&rm_spinlock);
+ }
+ }
+}
+
+void
+rm_init_flags(struct rmlock *rm, const char *name, int opts)
+{
+ struct lock_class *lc;
+ int liflags;
+
+ liflags = 0;
+ if (!(opts & RM_NOWITNESS))
+ liflags |= LO_WITNESS;
+ if (opts & RM_RECURSE)
+ liflags |= LO_RECURSABLE;
+ rm->rm_writecpus = all_cpus;
+ LIST_INIT(&rm->rm_activeReaders);
+ if (opts & RM_SLEEPABLE) {
+ liflags |= LO_SLEEPABLE;
+ lc = &lock_class_rm_sleepable;
+ sx_init_flags(&rm->rm_lock_sx, "rmlock_sx", SX_NOWITNESS);
+ } else {
+ lc = &lock_class_rm;
+ mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx", MTX_NOWITNESS);
+ }
+ lock_init(&rm->lock_object, lc, name, NULL, liflags);
+}
+
+void
+rm_init(struct rmlock *rm, const char *name)
+{
+
+ rm_init_flags(rm, name, 0);
+}
+
+void
+rm_destroy(struct rmlock *rm)
+{
+
+ rm_assert(rm, RA_UNLOCKED);
+ LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_destroy(&rm->rm_lock_sx);
+ else
+ mtx_destroy(&rm->rm_lock_mtx);
+ lock_destroy(&rm->lock_object);
+}
+
+int
+rm_wowned(const struct rmlock *rm)
+{
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ return (sx_xlocked(&rm->rm_lock_sx));
+ else
+ return (mtx_owned(&rm->rm_lock_mtx));
+}
+
+void
+rm_sysinit(void *arg)
+{
+ struct rm_args *args = arg;
+
+ rm_init(args->ra_rm, args->ra_desc);
+}
+
+void
+rm_sysinit_flags(void *arg)
+{
+ struct rm_args_flags *args = arg;
+
+ rm_init_flags(args->ra_rm, args->ra_desc, args->ra_opts);
+}
+
+static int
+_rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
+{
+ struct pcpu *pc;
+
+ critical_enter();
+ pc = pcpu_find(curcpu);
+
+ /* Check if we just need to do a proper critical_exit. */
+ if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
+ critical_exit();
+ return (1);
+ }
+
+ /* Remove our tracker from the per-cpu list. */
+ rm_tracker_remove(pc, tracker);
+
+ /* Check to see if the IPI granted us the lock after all. */
+ if (tracker->rmp_flags) {
+ /* Just add back tracker - we hold the lock. */
+ rm_tracker_add(pc, tracker);
+ critical_exit();
+ return (1);
+ }
+
+ /*
+	 * We allow readers to acquire a lock even if a writer is blocked,
+	 * provided the lock is recursive and the reader already holds it.
+ */
+ if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ /*
+ * Just grant the lock if this thread already has a tracker
+ * for this lock on the per-cpu queue.
+ */
+ if (rm_trackers_present(pc, rm, curthread) != 0) {
+ mtx_lock_spin(&rm_spinlock);
+ LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
+ rmp_qentry);
+ tracker->rmp_flags = RMPF_ONQUEUE;
+ mtx_unlock_spin(&rm_spinlock);
+ rm_tracker_add(pc, tracker);
+ critical_exit();
+ return (1);
+ }
+ }
+
+ sched_unpin();
+ critical_exit();
+
+ if (trylock) {
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
+ if (!sx_try_xlock(&rm->rm_lock_sx))
+ return (0);
+ } else {
+ if (!mtx_trylock(&rm->rm_lock_mtx))
+ return (0);
+ }
+ } else {
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xlock(&rm->rm_lock_sx);
+ else
+ mtx_lock(&rm->rm_lock_mtx);
+ }
+
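+	/*
+	 * Holding the backing write lock guarantees that no writer is
+	 * active; take the read token for this CPU, register our
+	 * tracker and then drop the write lock again.
+	 */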
+ critical_enter();
+ pc = pcpu_find(curcpu);
+ CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
+ rm_tracker_add(pc, tracker);
+ sched_pin();
+ critical_exit();
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xunlock(&rm->rm_lock_sx);
+ else
+ mtx_unlock(&rm->rm_lock_mtx);
+
+ return (1);
+}
+
+int
+_rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
+{
+ struct thread *td = curthread;
+ struct pcpu *pc;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ tracker->rmp_flags = 0;
+ tracker->rmp_thread = td;
+ tracker->rmp_rmlock = rm;
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ THREAD_NO_SLEEPING();
+
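+	/*
+	 * Enter an open-coded critical section (see the critical_enter()
+	 * annotations) so the tracker is published on the per-cpu list
+	 * without any risk of migrating to another CPU in between.
+	 */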
+ td->td_critnest++; /* critical_enter(); */
+
+ __compiler_membar();
+
+ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
+
+ rm_tracker_add(pc, tracker);
+
+ sched_pin();
+
+ __compiler_membar();
+
+ td->td_critnest--;
+
+ /*
+ * Fast path to combine two common conditions into a single
+ * conditional jump.
+ */
+ if (0 == (td->td_owepreempt |
+ CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)))
+ return (1);
+
+ /* We do not have a read token and need to acquire one. */
+	return (_rm_rlock_hard(rm, tracker, trylock));
+}
+
+static void
+_rm_unlock_hard(struct thread *td, struct rm_priotracker *tracker)
+{
+
+ if (td->td_owepreempt) {
+ td->td_critnest++;
+ critical_exit();
+ }
+
+ if (!tracker->rmp_flags)
+ return;
+
+ mtx_lock_spin(&rm_spinlock);
+ LIST_REMOVE(tracker, rmp_qentry);
+
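+	/*
+	 * If a writer is waiting on us, wake it up through the
+	 * turnstile.
+	 */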
+ if (tracker->rmp_flags & RMPF_SIGNAL) {
+ struct rmlock *rm;
+ struct turnstile *ts;
+
+ rm = tracker->rmp_rmlock;
+
+ turnstile_chain_lock(&rm->lock_object);
+ mtx_unlock_spin(&rm_spinlock);
+
+ ts = turnstile_lookup(&rm->lock_object);
+
+ turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&rm->lock_object);
+ } else
+ mtx_unlock_spin(&rm_spinlock);
+}
+
+void
+_rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
+{
+ struct pcpu *pc;
+ struct thread *td = tracker->rmp_thread;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ td->td_critnest++; /* critical_enter(); */
+ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
+ rm_tracker_remove(pc, tracker);
+ td->td_critnest--;
+ sched_unpin();
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ THREAD_SLEEPING_OK();
+
+ if (0 == (td->td_owepreempt | tracker->rmp_flags))
+ return;
+
+ _rm_unlock_hard(td, tracker);
+}
+
+void
+_rm_wlock(struct rmlock *rm)
+{
+ struct rm_priotracker *prio;
+ struct turnstile *ts;
+ cpuset_t readcpus;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xlock(&rm->rm_lock_sx);
+ else
+ mtx_lock(&rm->rm_lock_mtx);
+
+ if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
+ /* Get all read tokens back */
+ readcpus = all_cpus;
+ CPU_NAND(&readcpus, &rm->rm_writecpus);
+ rm->rm_writecpus = all_cpus;
+
+ /*
+ * Assumes rm->rm_writecpus update is visible on other CPUs
+ * before rm_cleanIPI is called.
+ */
+#ifdef SMP
+ smp_rendezvous_cpus(readcpus,
+ smp_no_rendevous_barrier,
+ rm_cleanIPI,
+ smp_no_rendevous_barrier,
+ rm);
+
+#else
+ rm_cleanIPI(rm);
+#endif
+
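+		/*
+		 * Block until every reader flagged by rm_cleanIPI has
+		 * released its read lock and signalled us through the
+		 * turnstile.
+		 */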
+ mtx_lock_spin(&rm_spinlock);
+ while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
+ ts = turnstile_trywait(&rm->lock_object);
+ prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
+ mtx_unlock_spin(&rm_spinlock);
+ turnstile_wait(ts, prio->rmp_thread,
+ TS_EXCLUSIVE_QUEUE);
+ mtx_lock_spin(&rm_spinlock);
+ }
+ mtx_unlock_spin(&rm_spinlock);
+ }
+}
+
+void
+_rm_wunlock(struct rmlock *rm)
+{
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xunlock(&rm->rm_lock_sx);
+ else
+ mtx_unlock(&rm->rm_lock_mtx);
+}
+
+#ifdef LOCK_DEBUG
+
+void
+_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
+ curthread, rm->lock_object.lo_name, file, line));
+ KASSERT(!rm_destroyed(rm),
+ ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_UNLOCKED, file, line);
+
+ WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
+ file, line, NULL);
+
+ _rm_wlock(rm);
+
+ LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
+
+ WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
+
+ curthread->td_locks++;
+}
+
+void
+_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(!rm_destroyed(rm),
+ ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_WLOCKED, file, line);
+ WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
+ _rm_wunlock(rm);
+ curthread->td_locks--;
+}
+
+int
+_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ int trylock, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+#ifdef INVARIANTS
+ if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
+ critical_enter();
+ KASSERT(rm_trackers_present(pcpu_find(curcpu), rm,
+ curthread) == 0,
+ ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
+ rm->lock_object.lo_name, file, line));
+ critical_exit();
+ }
+#endif
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
+ curthread, rm->lock_object.lo_name, file, line));
+ KASSERT(!rm_destroyed(rm),
+ ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
+ if (!trylock) {
+ KASSERT(!rm_wowned(rm),
+ ("rm_rlock: wlock already held for %s @ %s:%d",
+ rm->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER, file, line,
+ NULL);
+ }
+
+ if (_rm_rlock(rm, tracker, trylock)) {
+ if (trylock)
+ LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
+ line);
+ else
+ LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
+ line);
+ WITNESS_LOCK(&rm->lock_object, 0, file, line);
+
+ curthread->td_locks++;
+
+ return (1);
+ } else if (trylock)
+ LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);
+
+ return (0);
+}
+
+void
+_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(!rm_destroyed(rm),
+ ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_RLOCKED, file, line);
+ WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
+ _rm_runlock(rm, tracker);
+ curthread->td_locks--;
+}
+
+#else
+
+/*
+ * Just strip out file and line arguments if no lock debugging is enabled in
+ * the kernel - we are called from a kernel module.
+ */
+void
+_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ _rm_wlock(rm);
+}
+
+void
+_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ _rm_wunlock(rm);
+}
+
+int
+_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ int trylock, const char *file, int line)
+{
+
+	return (_rm_rlock(rm, tracker, trylock));
+}
+
+void
+_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ const char *file, int line)
+{
+
+ _rm_runlock(rm, tracker);
+}
+
+#endif
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _rm_assert
+#endif
+
+/*
+ * Note that this does not need to use witness_assert() for read lock
+ * assertions since an exact count of read locks held by this thread
+ * is computable.
+ */
+void
+_rm_assert(const struct rmlock *rm, int what, const char *file, int line)
+{
+ int count;
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case RA_LOCKED:
+ case RA_LOCKED | RA_RECURSED:
+ case RA_LOCKED | RA_NOTRECURSED:
+ case RA_RLOCKED:
+ case RA_RLOCKED | RA_RECURSED:
+ case RA_RLOCKED | RA_NOTRECURSED:
+ /*
+ * Handle the write-locked case. Unlike other
+ * primitives, writers can never recurse.
+ */
+ if (rm_wowned(rm)) {
+ if (what & RA_RLOCKED)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ }
+
+ critical_enter();
+ count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
+ critical_exit();
+
+ if (count == 0)
+ panic("Lock %s not %slocked @ %s:%d\n",
+ rm->lock_object.lo_name, (what & RA_RLOCKED) ?
+ "read " : "", file, line);
+ if (count > 1) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ case RA_WLOCKED:
+ if (!rm_wowned(rm))
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ case RA_UNLOCKED:
+ if (rm_wowned(rm))
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+
+ critical_enter();
+ count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
+ critical_exit();
+
+ if (count != 0)
+ panic("Lock %s read locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+static void
+print_tracker(struct rm_priotracker *tr)
+{
+ struct thread *td;
+
+ td = tr->rmp_thread;
+ db_printf(" thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
+ td->td_proc->p_pid, td->td_name);
+ if (tr->rmp_flags & RMPF_ONQUEUE) {
+ db_printf("ONQUEUE");
+ if (tr->rmp_flags & RMPF_SIGNAL)
+ db_printf(",SIGNAL");
+ } else
+ db_printf("0");
+ db_printf("}\n");
+}
+
+static void
+db_show_rm(const struct lock_object *lock)
+{
+ struct rm_priotracker *tr;
+ struct rm_queue *queue;
+ const struct rmlock *rm;
+ struct lock_class *lc;
+ struct pcpu *pc;
+
+ rm = (const struct rmlock *)lock;
+ db_printf(" writecpus: ");
+ ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
+ db_printf("\n");
+ db_printf(" per-CPU readers:\n");
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
+ for (queue = pc->pc_rm_queue.rmq_next;
+ queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
+ tr = (struct rm_priotracker *)queue;
+ if (tr->rmp_rmlock == rm)
+ print_tracker(tr);
+ }
+ db_printf(" active readers:\n");
+ LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
+ print_tracker(tr);
+ lc = LOCK_CLASS(&rm->rm_wlock_object);
+ db_printf("Backing write-lock (%s):\n", lc->lc_name);
+ lc->lc_ddb_show(&rm->rm_wlock_object);
+}
+#endif
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
new file mode 100644
index 0000000..bd40704
--- /dev/null
+++ b/sys/kern/kern_rwlock.c
@@ -0,0 +1,1232 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_no_adaptive_rwlocks.h"
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+
+#include <machine/cpu.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
+#define ADAPTIVE_RWLOCKS
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+/*
+ * Return the rwlock address when the lock cookie address is provided.
+ * This functionality assumes that struct rwlock has a member named rw_lock.
+ */
+#define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock))
+
+#ifdef ADAPTIVE_RWLOCKS
+static int rowner_retries = 10;
+static int rowner_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
+ "rwlock debugging");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void db_show_rwlock(const struct lock_object *lock);
+#endif
+static void assert_rw(const struct lock_object *lock, int what);
+static void lock_rw(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_rw(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_rw(struct lock_object *lock);
+
+struct lock_class lock_class_rw = {
+ .lc_name = "rw",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
+ .lc_assert = assert_rw,
+#ifdef DDB
+ .lc_ddb_show = db_show_rwlock,
+#endif
+ .lc_lock = lock_rw,
+ .lc_unlock = unlock_rw,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rw,
+#endif
+};
+
+/*
+ * Return a pointer to the owning thread if the lock is write-locked or
+ * NULL if the lock is unlocked or read-locked.
+ */
+#define rw_wowner(rw) \
+ ((rw)->rw_lock & RW_LOCK_READ ? NULL : \
+ (struct thread *)RW_OWNER((rw)->rw_lock))
+
+/*
+ * Returns whether the write owner is recursed. Write ownership is not
+ * assured here and should be checked beforehand.
+ */
+#define rw_recursed(rw) ((rw)->rw_recurse != 0)
+
+/*
+ * Return true if curthread holds the lock.
+ */
+#define rw_wlocked(rw) (rw_wowner((rw)) == curthread)
+
+/*
+ * Return a pointer to the owning thread for this lock who should receive
+ * any priority lent by threads that block on this lock. Currently this
+ * is identical to rw_wowner().
+ */
+#define rw_owner(rw) rw_wowner(rw)
+
+#ifndef INVARIANTS
+#define __rw_assert(c, what, file, line)
+#endif
+
+void
+assert_rw(const struct lock_object *lock, int what)
+{
+
+ rw_assert((const struct rwlock *)lock, what);
+}
+
+void
+lock_rw(struct lock_object *lock, int how)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ if (how)
+ rw_wlock(rw);
+ else
+ rw_rlock(rw);
+}
+
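+/*
+ * Release the lock on behalf of a sleep primitive and report how it
+ * was held so that lock_rw() can later reacquire it the same way.
+ */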
+int
+unlock_rw(struct lock_object *lock)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
+ if (rw->rw_lock & RW_LOCK_READ) {
+ rw_runlock(rw);
+ return (0);
+ } else {
+ rw_wunlock(rw);
+ return (1);
+ }
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_rw(const struct lock_object *lock, struct thread **owner)
+{
+ const struct rwlock *rw = (const struct rwlock *)lock;
+ uintptr_t x = rw->rw_lock;
+
+ *owner = rw_wowner(rw);
+ return ((x & RW_LOCK_READ) != 0 ? (RW_READERS(x) != 0) :
+ (*owner != NULL));
+}
+#endif
+
+void
+_rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
+{
+ struct rwlock *rw;
+ int flags;
+
+ rw = rwlock2rw(c);
+
+ MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
+ RW_RECURSE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
+ ("%s: rw_lock not aligned for %s: %p", __func__, name,
+ &rw->rw_lock));
+
+ flags = LO_UPGRADABLE;
+ if (opts & RW_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & RW_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & RW_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & RW_RECURSE)
+ flags |= LO_RECURSABLE;
+ if (opts & RW_QUIET)
+ flags |= LO_QUIET;
+
+ lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
+ rw->rw_lock = RW_UNLOCKED;
+ rw->rw_recurse = 0;
+}
+
+void
+_rw_destroy(volatile uintptr_t *c)
+{
+ struct rwlock *rw;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
+ KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
+ rw->rw_lock = RW_DESTROYED;
+ lock_destroy(&rw->lock_object);
+}
+
+void
+rw_sysinit(void *arg)
+{
+ struct rw_args *args = arg;
+
+ rw_init((struct rwlock *)args->ra_rw, args->ra_desc);
+}
+
+void
+rw_sysinit_flags(void *arg)
+{
+ struct rw_args_flags *args = arg;
+
+ rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
+ args->ra_flags);
+}
+
+int
+_rw_wowned(const volatile uintptr_t *c)
+{
+
+ return (rw_wowner(rwlock2rw(c)) == curthread);
+}
+
+void
+_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line, NULL);
+ __rw_wlock(rw, curthread, file, line);
+ LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
+ WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+}
+
+int
+__rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
+
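+	/*
+	 * A recursive acquisition only bumps the recursion count;
+	 * otherwise attempt a single atomic swap from unlocked to
+	 * owned by curthread.
+	 */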
+ if (rw_wlocked(rw) &&
+ (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ rw->rw_recurse++;
+ rval = 1;
+ } else
+ rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED,
+ (uintptr_t)curthread);
+
+ LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ }
+ return (rval);
+}
+
+void
+_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_WLOCKED, file, line);
+ WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
+ line);
+ if (!rw_recursed(rw))
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE, rw);
+ __rw_wunlock(rw, curthread, file, line);
+ curthread->td_locks--;
+}
+
+/*
+ * Determines whether a new reader can acquire a lock. Succeeds if the
+ * reader already owns a read lock and the lock is locked for read to
+ * prevent deadlock from reader recursion. Also succeeds if the lock
+ * is unlocked and has no writer waiters or spinners. Failing otherwise
+ * prioritizes writers before readers.
+ */
+#define RW_CAN_READ(_rw) \
+ ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) & \
+ (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == \
+ RW_LOCK_READ)
+
+void
+__rw_rlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+ int spintries = 0;
+ int i;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t v;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
+ KASSERT(rw_wowner(rw) != curthread,
+ ("rw_rlock: wlock already held for %s @ %s:%d",
+ rw->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
+
+ for (;;) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ /*
+ * Handle the easy case. If no other thread has a write
+ * lock, then try to bump up the count of read locks. Note
+ * that we have to preserve the current state of the
+ * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a
+ * read lock, then rw_lock must have changed, so restart
+ * the loop. Note that this handles the case of a
+ * completely unlocked rwlock since such a lock is encoded
+ * as a read lock with no waiters.
+ */
+ v = rw->rw_lock;
+ if (RW_CAN_READ(v)) {
+ /*
+ * The RW_LOCK_READ_WAITERS flag should only be set
+ * if the lock has been unlocked and write waiters
+ * were present.
+ */
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v,
+ v + RW_ONE_READER)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ rw, (void *)v,
+ (void *)(v + RW_ONE_READER));
+ break;
+ }
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if ((v & RW_LOCK_READ) == 0) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+ while ((struct thread*)RW_OWNER(rw->rw_lock) ==
+ owner && TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ } else if (spintries < rowner_retries) {
+ spintries++;
+ for (i = 0; i < rowner_loops; i++) {
+ v = rw->rw_lock;
+ if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v))
+ break;
+ cpu_spinwait();
+ }
+ if (i != rowner_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Okay, now it's the hard case. Some other thread already
+ * has a write lock or there are write waiters present,
+ * acquire the turnstile lock so we can begin the process
+ * of blocking.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+
+ /*
+ * The lock might have been released while we spun, so
+ * recheck its state and restart the loop if needed.
+ */
+ v = rw->rw_lock;
+ if (RW_CAN_READ(v)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ if ((v & RW_LOCK_READ) == 0) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * The lock is held in write mode or it already has waiters.
+ */
+ MPASS(!RW_CAN_READ(v));
+
+ /*
+ * If the RW_LOCK_READ_WAITERS flag is already set, then
+ * we can go ahead and block. If it is not set then try
+ * to set it. If we fail to set it drop the turnstile
+ * lock and restart the loop.
+ */
+ if (!(v & RW_LOCK_READ_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_READ_WAITERS)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set read waiters flag",
+ __func__, rw);
+ }
+
+ /*
+ * We were unable to acquire the lock and the read waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+ }
+
+ /*
+ * TODO: acquire "owner of record" here. Here be turnstile dragons
+ * however. turnstiles don't like owners changing between calls to
+ * turnstile_wait() currently.
+ */
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_RLOCK_ACQUIRE, rw, contested,
+ waittime, file, line);
+ LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&rw->lock_object, 0, file, line);
+ curthread->td_locks++;
+ curthread->td_rw_rlocks++;
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_RW_RLOCK_BLOCK, rw, sleep_time);
+
+ /*
+ * Record only the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_RW_RLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
+#endif
+}
+
+int
+__rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ uintptr_t x;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+
+ for (;;) {
+ x = rw->rw_lock;
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
+ if (!(x & RW_LOCK_READ))
+ break;
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) {
+ LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
+ line);
+ WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
+ curthread->td_locks++;
+ curthread->td_rw_rlocks++;
+ return (1);
+ }
+ }
+
+ LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
+ return (0);
+}
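+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a caller in a
+ * context that must not sleep can use the try-rlock path above and fall
+ * back on failure, dropping the lock with rw_runlock() when done.  The
+ * example_lock/example_stat names are hypothetical; only the rw_*()
+ * calls reflect the interface implemented in this file.
+ */
+#if 0
+static struct rwlock example_lock;
+static int example_stat;
+
+static int
+example_peek_stat(int *out)
+{
+
+	if (!rw_try_rlock(&example_lock))
+		return (EBUSY);		/* would otherwise have to block */
+	*out = example_stat;
+	rw_runlock(&example_lock);
+	return (0);
+}
+#endif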
+
+void
+_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t x, v, queue;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_RLOCKED, file, line);
+ WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
+
+ /* TODO: drop "owner of record" here. */
+
+ for (;;) {
+ /*
+ * See if there is more than one read lock held. If so,
+ * just drop one and return.
+ */
+ x = rw->rw_lock;
+ if (RW_READERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
+ x - RW_ONE_READER)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, rw, (void *)x,
+ (void *)(x - RW_ONE_READER));
+ break;
+ }
+ continue;
+ }
+ /*
+ * If there aren't any waiters for a write lock, then try
+ * to drop it quickly.
+ */
+ if (!(x & RW_LOCK_WAITERS)) {
+ MPASS((x & ~RW_LOCK_WRITE_SPINNER) ==
+ RW_READERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
+ RW_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, rw);
+ break;
+ }
+ continue;
+ }
+ /*
+ * Ok, we know we have waiters and we think we are the
+ * last reader, so grab the turnstile lock.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+ v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
+ MPASS(v & RW_LOCK_WAITERS);
+
+ /*
+ * Try to drop our lock, leaving the lock in an unlocked
+ * state.
+ *
+ * If you wanted to do explicit lock handoff you'd have to
+ * do it here.  You'd also want to use turnstile_signal()
+ * and you'd have to handle the race where a higher
+ * priority thread blocks on the write lock before the
+ * thread you wake up actually runs, and have the new
+ * thread "steal" the lock.  For now it's a lot simpler to
+ * just wake up all of the waiters.
+ *
+ * As above, if we fail, then another thread might have
+ * acquired a read lock, so drop the turnstile lock and
+ * restart.
+ */
+ x = RW_UNLOCKED;
+ if (v & RW_LOCK_WRITE_WAITERS) {
+ queue = TS_EXCLUSIVE_QUEUE;
+ x |= (v & RW_LOCK_READ_WAITERS);
+ } else
+ queue = TS_SHARED_QUEUE;
+ if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
+ x)) {
+ turnstile_chain_unlock(&rw->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
+ __func__, rw);
+
+ /*
+ * Ok. The lock is released and all that's left is to
+ * wake up the waiters. Note that the lock might not be
+ * free anymore, but in that case the writers will just
+ * block again if they run before the new lock holder(s)
+ * release the lock.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ turnstile_broadcast(ts, queue);
+ turnstile_unpend(ts, TS_SHARED_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+ break;
+ }
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_RUNLOCK_RELEASE, rw);
+ curthread->td_locks--;
+ curthread->td_rw_rlocks--;
+}
+
+/*
+ * This function is called when we are unable to obtain a write lock on the
+ * first try. This means that at least one other thread holds either a
+ * read or write lock.
+ */
+void
+__rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
+ int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+ int spintries = 0;
+ int i;
+#endif
+ uintptr_t v, x;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ if (rw_wlocked(rw)) {
+ KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
+ ("%s: recursing but non-recursive rw %s @ %s:%d\n",
+ __func__, rw->lock_object.lo_name, file, line));
+ rw->rw_recurse++;
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
+ return;
+ }
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
+
+ while (!_rw_write_lock(rw, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ v = rw->rw_lock;
+ owner = (struct thread *)RW_OWNER(v);
+ if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+ while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ if ((v & RW_LOCK_READ) && RW_READERS(v) &&
+ spintries < rowner_retries) {
+ if (!(v & RW_LOCK_WRITE_SPINNER)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_WRITE_SPINNER)) {
+ continue;
+ }
+ }
+ spintries++;
+ for (i = 0; i < rowner_loops; i++) {
+ if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0)
+ break;
+ cpu_spinwait();
+ }
+#ifdef KDTRACE_HOOKS
+ spin_cnt += rowner_loops - i;
+#endif
+ if (i != rowner_loops)
+ continue;
+ }
+#endif
+ ts = turnstile_trywait(&rw->lock_object);
+ v = rw->rw_lock;
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ if (!(v & RW_LOCK_READ)) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ }
+#endif
+ /*
+ * Check the waiters flags on this rwlock.  If the lock was
+ * released without leaving any waiters queued, simply try to
+ * acquire it.  If a pending waiters queue is present, claim
+ * lock ownership and preserve the queued waiters.
+ */
+ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
+ if ((v & ~x) == RW_UNLOCKED) {
+ x &= ~RW_LOCK_WRITE_SPINNER;
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) {
+ if (x)
+ turnstile_claim(ts);
+ else
+ turnstile_cancel(ts);
+ break;
+ }
+ turnstile_cancel(ts);
+ continue;
+ }
+ /*
+ * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
+ * set it. If we fail to set it, then loop back and try
+ * again.
+ */
+ if (!(v & RW_LOCK_WRITE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_WRITE_WAITERS)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set write waiters flag",
+ __func__, rw);
+ }
+ /*
+ * We were unable to acquire the lock and the write waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+#ifdef ADAPTIVE_RWLOCKS
+ spintries = 0;
+#endif
+ }
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, rw, contested,
+ waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_RW_WLOCK_BLOCK, rw, sleep_time);
+
+ /*
+ * Record only the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_RW_WLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
+#endif
+}
+
+/*
+ * This function is called if the first try at releasing a write lock failed.
+ * This means that one of the 2 waiter bits must be set indicating that at
+ * least one thread is waiting on this lock.
+ */
+void
+__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
+ int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t v;
+ int queue;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ if (rw_wlocked(rw) && rw_recursed(rw)) {
+ rw->rw_recurse--;
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
+ return;
+ }
+
+ KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
+ ("%s: neither of the waiter flags are set", __func__));
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
+
+ turnstile_chain_lock(&rw->lock_object);
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+
+ /*
+ * Use the same algorithm as sx locks for now.  Prefer waking up
+ * shared waiters over writers if we have any.  This is probably not ideal.
+ *
+ * 'v' is the value we are going to write back to rw_lock. If we
+ * have waiters on both queues, we need to preserve the state of
+ * the waiter flag for the queue we don't wake up. For now this is
+ * hardcoded for the algorithm mentioned above.
+ *
+ * In the case of both readers and writers waiting we wakeup the
+ * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a
+ * new writer comes in before a reader it will claim the lock up
+ * above. There is probably a potential priority inversion in
+ * there that could be worked around either by waking both queues
+ * of waiters or doing some complicated lock handoff gymnastics.
+ */
+ v = RW_UNLOCKED;
+ if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) {
+ queue = TS_EXCLUSIVE_QUEUE;
+ v |= (rw->rw_lock & RW_LOCK_READ_WAITERS);
+ } else
+ queue = TS_SHARED_QUEUE;
+
+ /* Wake up all waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
+ queue == TS_SHARED_QUEUE ? "read" : "write");
+ turnstile_broadcast(ts, queue);
+ atomic_store_rel_ptr(&rw->rw_lock, v);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+}
+
+/*
+ * Attempt to do a non-blocking upgrade from a read lock to a write
+ * lock. This will only succeed if this thread holds a single read
+ * lock. Returns true if the upgrade succeeded and false otherwise.
+ */
+int
+__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ uintptr_t v, x, tid;
+ struct turnstile *ts;
+ int success;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_RLOCKED, file, line);
+
+ /*
+ * Attempt to switch from one reader to a writer. If there
+ * are any write waiters, then we will have to lock the
+ * turnstile first to prevent races with another writer
+ * calling turnstile_wait() before we have claimed this
+ * turnstile. So, do the simple case of no waiters first.
+ */
+ tid = (uintptr_t)curthread;
+ success = 0;
+ for (;;) {
+ v = rw->rw_lock;
+ if (RW_READERS(v) > 1)
+ break;
+ if (!(v & RW_LOCK_WAITERS)) {
+ success = atomic_cmpset_ptr(&rw->rw_lock, v, tid);
+ if (!success)
+ continue;
+ break;
+ }
+
+ /*
+ * Ok, we think we have waiters, so lock the turnstile.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+ v = rw->rw_lock;
+ if (RW_READERS(v) > 1) {
+ turnstile_cancel(ts);
+ break;
+ }
+ /*
+ * Try to switch from one reader to a writer again. This time
+ * we honor the current state of the waiters flags.
+ * If we obtain the lock with the flags set, then claim
+ * ownership of the turnstile.
+ */
+ x = rw->rw_lock & RW_LOCK_WAITERS;
+ success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
+ if (success) {
+ if (x)
+ turnstile_claim(ts);
+ else
+ turnstile_cancel(ts);
+ break;
+ }
+ turnstile_cancel(ts);
+ }
+ LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
+ if (success) {
+ curthread->td_rw_rlocks--;
+ WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, rw);
+ }
+ return (success);
+}
+
+/*
+ * Downgrade a write lock into a single read lock.
+ */
+void
+__rw_downgrade(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t tid, v;
+ int rwait, wwait;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (rw_recursed(rw))
+ panic("downgrade of a recursed lock");
+#endif
+
+ WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
+
+ /*
+ * Convert from a writer to a single reader. First we handle
+ * the easy case with no waiters. If there are any waiters, we
+ * lock the turnstile and "disown" the lock.
+ */
+ tid = (uintptr_t)curthread;
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
+ goto out;
+
+ /*
+ * Ok, we think we have waiters, so lock the turnstile so we can
+ * read the waiter flags without any races.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+ v = rw->rw_lock & RW_LOCK_WAITERS;
+ rwait = v & RW_LOCK_READ_WAITERS;
+ wwait = v & RW_LOCK_WRITE_WAITERS;
+ MPASS(rwait | wwait);
+
+ /*
+ * Downgrade from a write lock while preserving waiters flag
+ * and give up ownership of the turnstile.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ if (!wwait)
+ v &= ~RW_LOCK_READ_WAITERS;
+ atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
+ /*
+ * Wake other readers if there are no writers pending. Otherwise they
+ * won't be able to acquire the lock anyway.
+ */
+ if (rwait && !wwait) {
+ turnstile_broadcast(ts, TS_SHARED_QUEUE);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ } else
+ turnstile_disown(ts);
+ turnstile_chain_unlock(&rw->lock_object);
+out:
+ curthread->td_rw_rlocks++;
+ LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
+ LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, rw);
+}
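+
+/*
+ * Editor's illustrative sketch (kept out of compilation): the common
+ * read-mostly update pattern built on the upgrade/downgrade primitives
+ * above.  The example names are hypothetical; on a failed upgrade the
+ * read lock is still held, so it is dropped and the write lock is
+ * taken the slow way.
+ */
+#if 0
+static struct rwlock example_cfg_lock;
+static int example_cfg_value;
+
+static void
+example_cfg_set(int v)
+{
+
+	rw_rlock(&example_cfg_lock);
+	if (example_cfg_value != v) {
+		if (!rw_try_upgrade(&example_cfg_lock)) {
+			rw_runlock(&example_cfg_lock);
+			rw_wlock(&example_cfg_lock);
+		}
+		example_cfg_value = v;
+		rw_downgrade(&example_cfg_lock);
+	}
+	rw_runlock(&example_cfg_lock);
+}
+#endif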
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef __rw_assert
+#endif
+
+/*
+ * In the non-WITNESS case, rw_assert() can only detect that at least
+ * *some* thread owns an rlock, but it cannot guarantee that *this*
+ * thread owns an rlock.
+ */
+void
+__rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
+{
+ const struct rwlock *rw;
+
+ if (panicstr != NULL)
+ return;
+
+ rw = rwlock2rw(c);
+
+ switch (what) {
+ case RA_LOCKED:
+ case RA_LOCKED | RA_RECURSED:
+ case RA_LOCKED | RA_NOTRECURSED:
+ case RA_RLOCKED:
+ case RA_RLOCKED | RA_RECURSED:
+ case RA_RLOCKED | RA_NOTRECURSED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If some other thread has a write lock or we have one
+ * and are asserting a read lock, fail. Also, if no one
+ * has a lock at all, fail.
+ */
+ if (rw->rw_lock == RW_UNLOCKED ||
+ (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
+ rw_wowner(rw) != curthread)))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ rw->lock_object.lo_name, (what & RA_RLOCKED) ?
+ "read " : "", file, line);
+
+ if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file,
+ line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ }
+#endif
+ break;
+ case RA_WLOCKED:
+ case RA_WLOCKED | RA_RECURSED:
+ case RA_WLOCKED | RA_NOTRECURSED:
+ if (rw_wowner(rw) != curthread)
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ break;
+ case RA_UNLOCKED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If we hold a write lock, fail.  We can't reliably check
+ * to see if we hold a read lock or not.
+ */
+ if (rw_wowner(rw) == curthread)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+#endif
+ break;
+ default:
+ panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+void
+db_show_rwlock(const struct lock_object *lock)
+{
+ const struct rwlock *rw;
+ struct thread *td;
+
+ rw = (const struct rwlock *)lock;
+
+ db_printf(" state: ");
+ if (rw->rw_lock == RW_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (rw->rw_lock == RW_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (rw->rw_lock & RW_LOCK_READ)
+ db_printf("RLOCK: %ju locks\n",
+ (uintmax_t)(RW_READERS(rw->rw_lock)));
+ else {
+ td = rw_wowner(rw);
+ db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (rw_recursed(rw))
+ db_printf(" recursed: %u\n", rw->rw_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
+ case RW_LOCK_READ_WAITERS:
+ db_printf("readers\n");
+ break;
+ case RW_LOCK_WRITE_WAITERS:
+ db_printf("writers\n");
+ break;
+ case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
+ db_printf("readers and writers\n");
+ break;
+ default:
+ db_printf("none\n");
+ break;
+ }
+}
+
+#endif
diff --git a/sys/kern/kern_sdt.c b/sys/kern/kern_sdt.c
new file mode 100644
index 0000000..c8e1940
--- /dev/null
+++ b/sys/kern/kern_sdt.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright 2006-2008 John Birrell <jb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sdt.h>
+
+/*
+ * Hook for the DTrace probe function. The SDT provider will set this to
+ * dtrace_probe() when it loads.
+ */
+sdt_probe_func_t sdt_probe_func = sdt_probe_stub;
+
+/*
+ * This is a stub for probe calls in case kernel DTrace support isn't
+ * enabled.  It should never get called, because probes are only enabled
+ * once the DTrace SDT provider has loaded and replaced this hook.
+ */
+void
+sdt_probe_stub(uint32_t id, uintptr_t arg0, uintptr_t arg1,
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
+{
+
+ printf("sdt_probe_stub: Why did this get called?\n");
+}
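+
+/*
+ * Editor's illustrative sketch (kept out of compilation): probe sites
+ * fire by calling indirectly through the hook above, so they reach
+ * either dtrace_probe() or the stub depending on whether the SDT
+ * provider has loaded.  The probe id and argument below are
+ * hypothetical.
+ */
+#if 0
+static void
+example_fire_probe(uint32_t id, uintptr_t arg0)
+{
+
+	(*sdt_probe_func)(id, arg0, 0, 0, 0, 0);
+}
+#endif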
diff --git a/sys/kern/kern_sema.c b/sys/kern/kern_sema.c
new file mode 100644
index 0000000..f09099e
--- /dev/null
+++ b/sys/kern/kern_sema.c
@@ -0,0 +1,176 @@
+/*-
+ * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Counting semaphores.
+ *
+ * Priority propagation will not generally raise the priority of semaphore
+ * "owners" (a misnomer in the context of semaphores), so should not be relied
+ * upon in combination with semaphores.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sema.h>
+
+void
+sema_init(struct sema *sema, int value, const char *description)
+{
+
+ KASSERT((value >= 0), ("%s(): negative value\n", __func__));
+
+ bzero(sema, sizeof(*sema));
+ mtx_init(&sema->sema_mtx, description, "sema backing lock",
+ MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ cv_init(&sema->sema_cv, description);
+ sema->sema_value = value;
+
+ CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description);
+}
+
+void
+sema_destroy(struct sema *sema)
+{
+
+ CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema,
+ cv_wmesg(&sema->sema_cv));
+
+ KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__));
+
+ mtx_destroy(&sema->sema_mtx);
+ cv_destroy(&sema->sema_cv);
+}
+
+void
+_sema_post(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ sema->sema_value++;
+ if (sema->sema_waiters && sema->sema_value > 0)
+ cv_signal(&sema->sema_cv);
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
+void
+_sema_wait(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ while (sema->sema_value == 0) {
+ sema->sema_waiters++;
+ cv_wait(&sema->sema_cv, &sema->sema_mtx);
+ sema->sema_waiters--;
+ }
+ sema->sema_value--;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
+int
+_sema_timedwait(struct sema *sema, int timo, const char *file, int line)
+{
+ int error;
+
+ mtx_lock(&sema->sema_mtx);
+
+ /*
+ * A spurious wakeup will cause the timeout interval to start over.
+ * This isn't a big deal as long as spurious wakeups don't occur
+ * continuously, since the timeout period is merely a lower bound on how
+ * long to wait.
+ */
+ for (error = 0; sema->sema_value == 0 && error == 0;) {
+ sema->sema_waiters++;
+ error = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo);
+ sema->sema_waiters--;
+ }
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ error = 0;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (error);
+}
+
+int
+_sema_trywait(struct sema *sema, const char *file, int line)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ ret = 1;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ ret = 0;
+
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
+
+int
+sema_value(struct sema *sema)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+ ret = sema->sema_value;
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
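+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a counting
+ * semaphore used to bound the number of threads inside a resource at
+ * once.  The names are hypothetical; only the sema_*() calls reflect
+ * the interface implemented in this file.
+ */
+#if 0
+static struct sema example_slots;
+
+static void
+example_slots_setup(void)
+{
+
+	/* Allow up to four consumers into the resource at once. */
+	sema_init(&example_slots, 4, "example slots");
+}
+
+static void
+example_use_slot(void)
+{
+
+	sema_wait(&example_slots);	/* sleeps until a slot is free */
+	/* ... use the shared resource ... */
+	sema_post(&example_slots);	/* release the slot */
+}
+#endif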
diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c
new file mode 100644
index 0000000..fd619cd
--- /dev/null
+++ b/sys/kern/kern_sharedpage.c
@@ -0,0 +1,239 @@
+/*-
+ * Copyright (c) 2010, 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static struct sx shared_page_alloc_sx;
+static vm_object_t shared_page_obj;
+static int shared_page_free;
+char *shared_page_mapping;
+
+void
+shared_page_write(int base, int size, const void *data)
+{
+
+ bcopy(data, shared_page_mapping + base, size);
+}
+
+static int
+shared_page_alloc_locked(int size, int align)
+{
+ int res;
+
+ res = roundup(shared_page_free, align);
+ if (res + size >= IDX_TO_OFF(shared_page_obj->size))
+ res = -1;
+ else
+ shared_page_free = res + size;
+ return (res);
+}
+
+int
+shared_page_alloc(int size, int align)
+{
+ int res;
+
+ sx_xlock(&shared_page_alloc_sx);
+ res = shared_page_alloc_locked(size, align);
+ sx_xunlock(&shared_page_alloc_sx);
+ return (res);
+}
+
+int
+shared_page_fill(int size, int align, const void *data)
+{
+ int res;
+
+ sx_xlock(&shared_page_alloc_sx);
+ res = shared_page_alloc_locked(size, align);
+ if (res != -1)
+ shared_page_write(res, size, data);
+ sx_xunlock(&shared_page_alloc_sx);
+ return (res);
+}
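+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a subsystem
+ * that wants to expose a small read-only blob to userspace can reserve
+ * and populate a chunk of the shared page much like the signal
+ * trampoline and timekeep structures below.  The blob and offset names
+ * are hypothetical.
+ */
+#if 0
+static const char example_blob[64];
+static int example_blob_off;
+
+static void
+example_publish_blob(void)
+{
+
+	/* 16-byte aligned; -1 means the shared page is exhausted. */
+	example_blob_off = shared_page_fill(sizeof(example_blob), 16,
+	    example_blob);
+	KASSERT(example_blob_off != -1, ("shared page exhausted"));
+}
+#endif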
+
+static void
+shared_page_init(void *dummy __unused)
+{
+ vm_page_t m;
+ vm_offset_t addr;
+
+ sx_init(&shared_page_alloc_sx, "shpsx");
+ shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
+ VM_PROT_DEFAULT, 0, NULL);
+ VM_OBJECT_WLOCK(shared_page_obj);
+ m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO);
+ m->valid = VM_PAGE_BITS_ALL;
+ VM_OBJECT_WUNLOCK(shared_page_obj);
+ addr = kva_alloc(PAGE_SIZE);
+ pmap_qenter(addr, &m, 1);
+ shared_page_mapping = (char *)addr;
+}
+
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
+ NULL);
+
+static void
+timehands_update(struct sysentvec *sv)
+{
+ struct vdso_timehands th;
+ struct vdso_timekeep *tk;
+ uint32_t enabled, idx;
+
+ enabled = tc_fill_vdso_timehands(&th);
+ tk = (struct vdso_timekeep *)(shared_page_mapping +
+ sv->sv_timekeep_off);
+ idx = sv->sv_timekeep_curr;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+ if (++idx >= VDSO_TH_NUM)
+ idx = 0;
+ sv->sv_timekeep_curr = idx;
+ if (++sv->sv_timekeep_gen == 0)
+ sv->sv_timekeep_gen = 1;
+ th.th_gen = 0;
+ if (enabled)
+ tk->tk_th[idx] = th;
+ tk->tk_enabled = enabled;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+ tk->tk_current = idx;
+}
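+
+/*
+ * Editor's illustrative sketch (kept out of compilation): the update
+ * above follows a seqlock-style protocol, zeroing th_gen before the
+ * copy and publishing a nonzero generation afterwards.  A userspace
+ * reader (the real one lives in libc) is expected to retry until it
+ * observes a stable, nonzero generation; the memory barriers a real
+ * reader needs are omitted here for brevity.
+ */
+#if 0
+static int
+example_read_timehands(struct vdso_timekeep *tk, struct vdso_timehands *dst)
+{
+	struct vdso_timehands *th;
+	uint32_t curr, gen;
+
+	do {
+		if (!tk->tk_enabled)
+			return (ENOSYS);
+		curr = tk->tk_current;
+		th = &tk->tk_th[curr];
+		gen = th->th_gen;
+		*dst = *th;
+	} while (gen == 0 || gen != th->th_gen);
+	return (0);
+}
+#endif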
+
+#ifdef COMPAT_FREEBSD32
+static void
+timehands_update32(struct sysentvec *sv)
+{
+ struct vdso_timekeep32 *tk;
+ struct vdso_timehands32 th;
+ uint32_t enabled, idx;
+
+ enabled = tc_fill_vdso_timehands32(&th);
+ tk = (struct vdso_timekeep32 *)(shared_page_mapping +
+ sv->sv_timekeep_off);
+ idx = sv->sv_timekeep_curr;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+ if (++idx >= VDSO_TH_NUM)
+ idx = 0;
+ sv->sv_timekeep_curr = idx;
+ if (++sv->sv_timekeep_gen == 0)
+ sv->sv_timekeep_gen = 1;
+ th.th_gen = 0;
+ if (enabled)
+ tk->tk_th[idx] = th;
+ tk->tk_enabled = enabled;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+ tk->tk_current = idx;
+}
+#endif
+
+/*
+ * This is hackish, but it is the easiest way to avoid creating list
+ * structures that need to be iterated over from the hardclock interrupt
+ * context.
+ */
+static struct sysentvec *host_sysentvec;
+#ifdef COMPAT_FREEBSD32
+static struct sysentvec *compat32_sysentvec;
+#endif
+
+void
+timekeep_push_vdso(void)
+{
+
+ if (host_sysentvec != NULL && host_sysentvec->sv_timekeep_base != 0)
+ timehands_update(host_sysentvec);
+#ifdef COMPAT_FREEBSD32
+ if (compat32_sysentvec != NULL &&
+ compat32_sysentvec->sv_timekeep_base != 0)
+ timehands_update32(compat32_sysentvec);
+#endif
+}
+
+void
+exec_sysvec_init(void *param)
+{
+ struct sysentvec *sv;
+ int tk_base;
+ uint32_t tk_ver;
+
+ sv = (struct sysentvec *)param;
+
+ if ((sv->sv_flags & SV_SHP) == 0)
+ return;
+ sv->sv_shared_page_obj = shared_page_obj;
+ sv->sv_sigcode_base = sv->sv_shared_page_base +
+ shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
+ if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+ return;
+ tk_ver = VDSO_TK_VER_CURR;
+#ifdef COMPAT_FREEBSD32
+ if ((sv->sv_flags & SV_ILP32) != 0) {
+ tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) +
+ sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16);
+ KASSERT(tk_base != -1, ("tk_base -1 for 32bit"));
+ shared_page_write(tk_base + offsetof(struct vdso_timekeep32,
+ tk_ver), sizeof(uint32_t), &tk_ver);
+ KASSERT(compat32_sysentvec == 0,
+ ("Native compat32 already registered"));
+ compat32_sysentvec = sv;
+ } else {
+#endif
+ tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) +
+ sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16);
+ KASSERT(tk_base != -1, ("tk_base -1 for native"));
+ shared_page_write(tk_base + offsetof(struct vdso_timekeep,
+ tk_ver), sizeof(uint32_t), &tk_ver);
+ KASSERT(host_sysentvec == 0, ("Native already registered"));
+ host_sysentvec = sv;
+#ifdef COMPAT_FREEBSD32
+ }
+#endif
+ sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base;
+ sv->sv_timekeep_off = tk_base;
+ timekeep_push_vdso();
+}
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
new file mode 100644
index 0000000..b120263
--- /dev/null
+++ b/sys/kern/kern_shutdown.c
@@ -0,0 +1,893 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdb.h"
+#include "opt_panic.h"
+#include "opt_sched.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/eventhandler.h>
+#include <sys/jail.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kerneldump.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+
+#include <ddb/ddb.h>
+
+#include <machine/cpu.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
+#include <sys/signalvar.h>
+
+#ifndef PANIC_REBOOT_WAIT_TIME
+#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#ifdef KDB
+#ifdef KDB_UNATTENDED
+int debugger_on_panic = 0;
+#else
+int debugger_on_panic = 1;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic,
+ CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_TUN,
+ &debugger_on_panic, 0, "Run debugger on kernel panic");
+TUNABLE_INT("debug.debugger_on_panic", &debugger_on_panic);
+
+#ifdef KDB_TRACE
+static int trace_on_panic = 1;
+#else
+static int trace_on_panic = 0;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, trace_on_panic,
+ CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_TUN,
+ &trace_on_panic, 0, "Print stack trace on kernel panic");
+TUNABLE_INT("debug.trace_on_panic", &trace_on_panic);
+#endif /* KDB */
+
+static int sync_on_panic = 0;
+SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
+ &sync_on_panic, 0, "Do a sync before rebooting from a panic");
+TUNABLE_INT("kern.sync_on_panic", &sync_on_panic);
+
+static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0,
+ "Shutdown environment");
+
+#ifndef DIAGNOSTIC
+static int show_busybufs;
+#else
+static int show_busybufs = 1;
+#endif
+SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
+ &show_busybufs, 0, "");
+
+/*
+ * The panicstr variable contains the argument to the first call to panic;
+ * it is used as a flag to indicate that the kernel has already called panic.
+ */
+const char *panicstr;
+
+int dumping; /* system is dumping */
+int rebooting; /* system is rebooting */
+static struct dumperinfo dumper; /* our selected dumper */
+
+/* Context information for dump-debuggers. */
+static struct pcb dumppcb; /* Registers. */
+lwpid_t dumptid; /* Thread ID. */
+
+static void poweroff_wait(void *, int);
+static void shutdown_halt(void *junk, int howto);
+static void shutdown_panic(void *junk, int howto);
+static void shutdown_reset(void *junk, int howto);
+static void vpanic(const char *fmt, va_list ap) __dead2;
+
+/* register various local shutdown events */
+static void
+shutdown_conf(void *unused)
+{
+
+ EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
+ SHUTDOWN_PRI_FIRST);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
+ SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
+ SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
+ SHUTDOWN_PRI_LAST + 200);
+}
+
+SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
+
+/*
+ * The system call that results in a reboot.
+ */
+/* ARGSUSED */
+int
+sys_reboot(struct thread *td, struct reboot_args *uap)
+{
+ int error;
+
+ error = 0;
+#ifdef MAC
+ error = mac_system_check_reboot(td->td_ucred, uap->opt);
+#endif
+ if (error == 0)
+ error = priv_check(td, PRIV_REBOOT);
+ if (error == 0) {
+ mtx_lock(&Giant);
+ kern_reboot(uap->opt);
+ mtx_unlock(&Giant);
+ }
+ return (error);
+}
+
+/*
+ * Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
+ */
+static int shutdown_howto = 0;
+
+void
+shutdown_nice(int howto)
+{
+
+ shutdown_howto = howto;
+
+ /* Send a signal to init(8) and have it shut down the world. */
+ if (initproc != NULL) {
+ PROC_LOCK(initproc);
+ kern_psignal(initproc, SIGINT);
+ PROC_UNLOCK(initproc);
+ } else {
+ /* No init(8) running, so simply reboot */
+ kern_reboot(RB_NOSYNC);
+ }
+ return;
+}
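+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a power-button
+ * or similar event handler would typically request an orderly shutdown
+ * through shutdown_nice() rather than calling kern_reboot() directly,
+ * so that init(8) gets a chance to bring the system down cleanly.  The
+ * handler name is hypothetical.
+ */
+#if 0
+static void
+example_power_button_event(void)
+{
+
+	shutdown_nice(RB_POWEROFF);
+}
+#endif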
+static int waittime = -1;
+
+static void
+print_uptime(void)
+{
+ int f;
+ struct timespec ts;
+
+ getnanouptime(&ts);
+ printf("Uptime: ");
+ f = 0;
+ if (ts.tv_sec >= 86400) {
+ printf("%ldd", (long)ts.tv_sec / 86400);
+ ts.tv_sec %= 86400;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 3600) {
+ printf("%ldh", (long)ts.tv_sec / 3600);
+ ts.tv_sec %= 3600;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 60) {
+ printf("%ldm", (long)ts.tv_sec / 60);
+ ts.tv_sec %= 60;
+ f = 1;
+ }
+ printf("%lds\n", (long)ts.tv_sec);
+}
+
+int
+doadump(boolean_t textdump)
+{
+ boolean_t coredump;
+
+ if (dumping)
+ return (EBUSY);
+ if (dumper.dumper == NULL)
+ return (ENXIO);
+
+ savectx(&dumppcb);
+ dumptid = curthread->td_tid;
+ dumping++;
+
+ coredump = TRUE;
+#ifdef DDB
+ if (textdump && textdump_pending) {
+ coredump = FALSE;
+ textdump_dumpsys(&dumper);
+ }
+#endif
+ if (coredump)
+ dumpsys(&dumper);
+
+ dumping--;
+ return (0);
+}
+
+static int
+isbufbusy(struct buf *bp)
+{
+ if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
+ BUF_ISLOCKED(bp)) ||
+ ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
+ return (1);
+ return (0);
+}
+
+/*
+ * Shut down the system cleanly to prepare for reboot, halt, or power off.
+ */
+void
+kern_reboot(int howto)
+{
+ static int first_buf_printf = 1;
+
+#if defined(SMP)
+ /*
+ * Bind us to CPU 0 so that all shutdown code runs there. Some
+ * systems don't shutdown properly (i.e., ACPI power off) if we
+ * run on another processor.
+ */
+ if (!SCHEDULER_STOPPED()) {
+ thread_lock(curthread);
+ sched_bind(curthread, 0);
+ thread_unlock(curthread);
+ KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0"));
+ }
+#endif
+ /* We're in the process of rebooting. */
+ rebooting = 1;
+
+ /* collect extra flags that shutdown_nice might have set */
+ howto |= shutdown_howto;
+
+ /* We are out of the debugger now. */
+ kdb_active = 0;
+
+ /*
+ * Do any callouts that should be done BEFORE syncing the filesystems.
+ */
+ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
+
+ /*
+ * Now sync filesystems
+ */
+ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
+ register struct buf *bp;
+ int iter, nbusy, pbusy;
+#ifndef PREEMPTION
+ int subiter;
+#endif
+
+ waittime = 0;
+
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ for (iter = pbusy = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; )
+ if (isbufbusy(bp))
+ nbusy++;
+ if (nbusy == 0) {
+ if (first_buf_printf)
+ printf("All buffers synced.");
+ break;
+ }
+ if (first_buf_printf) {
+ printf("Syncing disks, buffers remaining... ");
+ first_buf_printf = 0;
+ }
+ printf("%d ", nbusy);
+ if (nbusy < pbusy)
+ iter = 0;
+ pbusy = nbusy;
+
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+#ifdef PREEMPTION
+ /*
+ * Drop Giant and spin for a while to allow
+ * interrupt threads to run.
+ */
+ DROP_GIANT();
+ DELAY(50000 * iter);
+ PICKUP_GIANT();
+#else
+ /*
+ * Drop Giant and context switch several times to
+ * allow interrupt threads to run.
+ */
+ DROP_GIANT();
+ for (subiter = 0; subiter < 50 * iter; subiter++) {
+ thread_lock(curthread);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
+ DELAY(1000);
+ }
+ PICKUP_GIANT();
+#endif
+ }
+ printf("\n");
+ /*
+ * Count only busy local buffers to prevent forcing
+ * an fsck if we're just a client of a wedged NFS server.
+ */
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if (isbufbusy(bp)) {
+#if 0
+/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
+ if (bp->b_dev == NULL) {
+ TAILQ_REMOVE(&mountlist,
+ bp->b_vp->v_mount, mnt_list);
+ continue;
+ }
+#endif
+ nbusy++;
+ if (show_busybufs > 0) {
+ printf(
+ "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
+ nbusy, bp, bp->b_vp, bp->b_flags,
+ (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno);
+ BUF_LOCKPRINTINFO(bp);
+ if (show_busybufs > 1)
+ vn_printf(bp->b_vp,
+ "vnode content: ");
+ }
+ }
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("Giving up on %d buffers\n", nbusy);
+ DELAY(5000000); /* 5 seconds */
+ } else {
+ if (!first_buf_printf)
+ printf("Final sync complete\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == 0)
+ vfs_unmountall();
+ }
+ swapoff_all();
+ DELAY(100000); /* wait for console output to finish */
+ }
+
+ print_uptime();
+
+ cngrab();
+
+ /*
+ * Ok, now do things that assume all filesystem activity has
+ * been completed.
+ */
+ EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
+
+ if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
+ doadump(TRUE);
+
+ /* Now that we're going to really halt the system... */
+ EVENTHANDLER_INVOKE(shutdown_final, howto);
+
+ for(;;) ; /* safety against shutdown_reset not working */
+ /* NOTREACHED */
+}
+
+/*
+ * If the shutdown was a clean halt, behave accordingly.
+ */
+static void
+shutdown_halt(void *junk, int howto)
+{
+
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+ switch (cngetc()) {
+ case -1: /* No console, just die */
+ cpu_halt();
+ /* NOTREACHED */
+ default:
+ howto &= ~RB_HALT;
+ break;
+ }
+ }
+}
+
+/*
+ * Check to see if the system panicked; if so, pause and then reboot
+ * according to the specified delay.
+ */
+static void
+shutdown_panic(void *junk, int howto)
+{
+ int loop;
+
+ if (howto & RB_DUMP) {
+ if (PANIC_REBOOT_WAIT_TIME != 0) {
+ if (PANIC_REBOOT_WAIT_TIME != -1) {
+ printf("Automatic reboot in %d seconds - "
+ "press a key on the console to abort\n",
+ PANIC_REBOOT_WAIT_TIME);
+ for (loop = PANIC_REBOOT_WAIT_TIME * 10;
+ loop > 0; --loop) {
+ DELAY(1000 * 100); /* 1/10th second */
+ /* Did user type a key? */
+ if (cncheckc() != -1)
+ break;
+ }
+ if (!loop)
+ return;
+ }
+ } else { /* zero time specified - reboot NOW */
+ return;
+ }
+ printf("--> Press a key on the console to reboot,\n");
+ printf("--> or switch off the system now.\n");
+ cngetc();
+ }
+}
+
+/*
+ * Everything done, now reset
+ */
+static void
+shutdown_reset(void *junk, int howto)
+{
+
+ printf("Rebooting...\n");
+ DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
+
+ /*
+ * Acquiring smp_ipi_mtx here has a double effect:
+ * - it disables interrupts, preventing CPU0 from being preempted
+ *   by fast interrupt handlers (which could otherwise deadlock
+ *   against other CPUs)
+ * - it avoids deadlocks against smp_rendezvous() or, more
+ *   generally, against threads busy-waiting with this spinlock
+ *   held while waiting for responses from threads on other CPUs
+ *   (e.g., smp_tlb_shootdown()).
+ *
+ * For the !SMP case it just needs to handle the former problem.
+ */
+#ifdef SMP
+ mtx_lock_spin(&smp_ipi_mtx);
+#else
+ spinlock_enter();
+#endif
+
+ /* cpu_boot(howto); */ /* doesn't do anything at the moment */
+ cpu_reset();
+ /* NOTREACHED */ /* assuming reset worked */
+}
+
+#if defined(WITNESS) || defined(INVARIANTS)
+static int kassert_warn_only = 0;
+#ifdef KDB
+static int kassert_do_kdb = 0;
+#endif
+#ifdef KTR
+static int kassert_do_ktr = 0;
+#endif
+static int kassert_do_log = 1;
+static int kassert_log_pps_limit = 4;
+static int kassert_log_mute_at = 0;
+static int kassert_log_panic_at = 0;
+static int kassert_warnings = 0;
+
+SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options");
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_warn_only, 0,
+ "KASSERT triggers a panic (1) or just a warning (0)");
+TUNABLE_INT("debug.kassert.warn_only", &kassert_warn_only);
+
+#ifdef KDB
+SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_kdb, 0, "KASSERT will enter the debugger");
+TUNABLE_INT("debug.kassert.do_kdb", &kassert_do_kdb);
+#endif
+
+#ifdef KTR
+SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_ktr, 0,
+ "KASSERT does a KTR, set this to the KTRMASK you want");
+TUNABLE_INT("debug.kassert.do_ktr", &kassert_do_ktr);
+#endif
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_log, 0, "Log KASSERT failures via printf (1) or not (0)");
+TUNABLE_INT("debug.kassert.do_log", &kassert_do_log);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_warnings, 0, "number of KASSERTs that have been triggered");
+TUNABLE_INT("debug.kassert.warnings", &kassert_warnings);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic");
+TUNABLE_INT("debug.kassert.log_panic_at", &kassert_log_panic_at);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_pps_limit, 0, "limit number of log messages per second");
+TUNABLE_INT("debug.kassert.log_pps_limit", &kassert_log_pps_limit);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_mute_at, 0, "max number of KASSERTS to log");
+TUNABLE_INT("debug.kassert.log_mute_at", &kassert_log_mute_at);
+
+static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kassert_sysctl_kassert, "I", "set to trigger a test kassert");
+
+static int
+kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i));
+ return (0);
+}
+
+/*
+ * Called by KASSERT, this decides whether we will panic
+ * or whether we will log via printf and/or ktr.
+ */
+void
+kassert_panic(const char *fmt, ...)
+{
+ static char buf[256];
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * panic if we're not just warning, or if we've exceeded
+ * kassert_log_panic_at warnings.
+ */
+ if (!kassert_warn_only ||
+ (kassert_log_panic_at > 0 &&
+ kassert_warnings >= kassert_log_panic_at)) {
+ va_start(ap, fmt);
+ vpanic(fmt, ap);
+ /* NORETURN */
+ }
+#ifdef KTR
+ if (kassert_do_ktr)
+ CTR0(ktr_mask, buf);
+#endif /* KTR */
+ /*
+ * log if we've not yet met the mute limit.
+ */
+ if (kassert_do_log &&
+ (kassert_log_mute_at == 0 ||
+ kassert_warnings < kassert_log_mute_at)) {
+ static struct timeval lasterr;
+ static int curerr;
+
+ if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) {
+ printf("KASSERT failed: %s\n", buf);
+ kdb_backtrace();
+ }
+ }
+#ifdef KDB
+ if (kassert_do_kdb) {
+ kdb_enter(KDB_WHY_KASSERT, buf);
+ }
+#endif
+ atomic_add_int(&kassert_warnings, 1);
+}
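+
+/*
+ * Editor's illustrative sketch (kept out of compilation): under
+ * INVARIANTS a failed KASSERT() expands into a call like the one below,
+ * so with debug.kassert.warn_only=1 the violation is logged and counted
+ * instead of panicking.  The refcount check is hypothetical.
+ */
+#if 0
+static void
+example_check_refs(int refs)
+{
+
+	if (refs < 0)
+		kassert_panic("example: negative refcount %d", refs);
+}
+#endif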
+#endif
+
+/*
+ * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
+ * and then reboots. If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ */
+void
+panic(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vpanic(fmt, ap);
+}
+
+static void
+vpanic(const char *fmt, va_list ap)
+{
+#ifdef SMP
+ cpuset_t other_cpus;
+#endif
+ struct thread *td = curthread;
+ int bootopt, newpanic;
+ static char buf[256];
+
+ spinlock_enter();
+
+#ifdef SMP
+ /*
+ * stop_cpus_hard(other_cpus) should prevent multiple CPUs from
+ * concurrently entering panic. Only the winner will proceed
+ * further.
+ */
+ if (panicstr == NULL && !kdb_active) {
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
+ }
+
+ /*
+ * We set td_stopsched here and not in the block above, because
+ * we want to ensure that it is always set once panic has been
+ * called, even if panic has been entered from kdb.
+ */
+ td->td_stopsched = 1;
+#endif
+
+ bootopt = RB_AUTOBOOT;
+ newpanic = 0;
+ if (panicstr)
+ bootopt |= RB_NOSYNC;
+ else {
+ bootopt |= RB_DUMP;
+ panicstr = fmt;
+ newpanic = 1;
+ }
+
+ if (newpanic) {
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ panicstr = buf;
+ cngrab();
+ printf("panic: %s\n", buf);
+ } else {
+ printf("panic: ");
+ vprintf(fmt, ap);
+ printf("\n");
+ }
+#ifdef SMP
+ printf("cpuid = %d\n", PCPU_GET(cpuid));
+#endif
+
+#ifdef KDB
+ if (newpanic && trace_on_panic)
+ kdb_backtrace();
+ if (debugger_on_panic)
+ kdb_enter(KDB_WHY_PANIC, "panic");
+#endif
+ /*thread_lock(td); */
+ td->td_flags |= TDF_INPANIC;
+ /* thread_unlock(td); */
+ if (!sync_on_panic)
+ bootopt |= RB_NOSYNC;
+ kern_reboot(bootopt);
+}
+
+/*
+ * Support for poweroff delay.
+ *
+ * Please note that setting this delay too short might power off your machine
+ * before the write cache on your hard disk has been flushed, leading to
+ * soft-updates inconsistencies.
+ */
+#ifndef POWEROFF_DELAY
+# define POWEROFF_DELAY 5000
+#endif
+static int poweroff_delay = POWEROFF_DELAY;
+
+SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
+ &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)");
+
+static void
+poweroff_wait(void *junk, int howto)
+{
+
+ if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
+ return;
+ DELAY(poweroff_delay * 1000);
+}
+
+/*
+ * Some system processes (e.g. syncer) need to be stopped at appropriate
+ * points in their main loops prior to a system shutdown, so that they
+ * won't interfere with the shutdown process (e.g. by holding a disk buf
+ * to cause sync to fail). For each of these system processes, register
+ * kproc_shutdown() as a handler for one of the shutdown events.
+ */
+static int kproc_shutdown_wait = 60;
+SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
+ &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process");
+
+void
+kproc_shutdown(void *arg, int howto)
+{
+ struct proc *p;
+ int error;
+
+ if (panicstr)
+ return;
+
+ p = (struct proc *)arg;
+ printf("Waiting (max %d seconds) for system process `%s' to stop...",
+ kproc_shutdown_wait, p->p_comm);
+ error = kproc_suspend(p, kproc_shutdown_wait * hz);
+
+ if (error == EWOULDBLOCK)
+ printf("timed out\n");
+ else
+ printf("done\n");
+}
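+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a system
+ * process registers the handler above at an appropriate shutdown event,
+ * typically when its main kproc starts.  The process pointer name is
+ * hypothetical.
+ */
+#if 0
+static struct proc *example_kproc;
+
+static void
+example_register_shutdown(void)
+{
+
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown,
+	    example_kproc, SHUTDOWN_PRI_FIRST);
+}
+#endif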
+
+void
+kthread_shutdown(void *arg, int howto)
+{
+ struct thread *td;
+ int error;
+
+ if (panicstr)
+ return;
+
+ td = (struct thread *)arg;
+ printf("Waiting (max %d seconds) for system thread `%s' to stop...",
+ kproc_shutdown_wait, td->td_name);
+ error = kthread_suspend(td, kproc_shutdown_wait * hz);
+
+ if (error == EWOULDBLOCK)
+ printf("timed out\n");
+ else
+ printf("done\n");
+}
+
+static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)];
+SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD,
+ dumpdevname, 0, "Device for kernel dumps");
+
+/* Registration of dumpers */
+int
+set_dumper(struct dumperinfo *di, const char *devname)
+{
+ size_t wantcopy;
+
+ if (di == NULL) {
+ bzero(&dumper, sizeof dumper);
+ dumpdevname[0] = '\0';
+ return (0);
+ }
+ if (dumper.dumper != NULL)
+ return (EBUSY);
+ dumper = *di;
+ wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname));
+ if (wantcopy >= sizeof(dumpdevname)) {
+ printf("set_dumper: device name truncated from '%s' -> '%s'\n",
+ devname, dumpdevname);
+ }
+ return (0);
+}
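+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a storage
+ * driver registers itself as the dump target by filling in a
+ * dumperinfo and calling set_dumper().  Only the fields referenced in
+ * this file are shown; the callback, device name and media size are
+ * hypothetical.
+ */
+#if 0
+static int
+example_dumper(void *priv, void *virtual, vm_offset_t physical,
+    off_t offset, size_t length)
+{
+
+	/* Write 'length' bytes from 'virtual' at media byte 'offset'. */
+	return (0);
+}
+
+static void
+example_register_dumper(void)
+{
+	struct dumperinfo di;
+
+	bzero(&di, sizeof(di));
+	di.dumper = example_dumper;
+	di.priv = NULL;
+	di.mediaoffset = 0;
+	di.mediasize = 1024 * 1024 * 1024;	/* 1GB of dump media */
+	(void)set_dumper(&di, "example0");
+}
+#endif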
+
+/* Call dumper with bounds checking. */
+int
+dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
+ off_t offset, size_t length)
+{
+
+ if (length != 0 && (offset < di->mediaoffset ||
+ offset - di->mediaoffset + length > di->mediasize)) {
+ printf("Attempt to write outside dump device boundaries.\n"
+ "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
+ (intmax_t)offset, (intmax_t)di->mediaoffset,
+ (uintmax_t)length, (intmax_t)di->mediasize);
+ return (ENOSPC);
+ }
+ return (di->dumper(di->priv, virtual, physical, offset, length));
+}
+
+void
+mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
+ uint64_t dumplen, uint32_t blksz)
+{
+
+ bzero(kdh, sizeof(*kdh));
+ strncpy(kdh->magic, magic, sizeof(kdh->magic));
+ strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
+ kdh->version = htod32(KERNELDUMPVERSION);
+ kdh->architectureversion = htod32(archver);
+ kdh->dumplength = htod64(dumplen);
+ kdh->dumptime = htod64(time_second);
+ kdh->blocksize = htod32(blksz);
+ strncpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
+ strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
+ if (panicstr != NULL)
+ strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
+ kdh->parity = kerneldump_parity(kdh);
+}
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
new file mode 100644
index 0000000..1797ebc
--- /dev/null
+++ b/sys/kern/kern_sig.c
@@ -0,0 +1,3469 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_core.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/capability.h>
+#include <sys/condvar.h>
+#include <sys/event.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/posix4.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sdt.h>
+#include <sys/sbuf.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/timers.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+#include <sys/jail.h>
+
+#include <machine/cpu.h>
+
+#include <security/audit/audit.h>
+
+#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE3(proc, kernel, , signal_send, signal-send, "struct thread *",
+ "struct proc *", "int");
+SDT_PROBE_DEFINE2(proc, kernel, , signal_clear, signal-clear, "int",
+ "ksiginfo_t *");
+SDT_PROBE_DEFINE3(proc, kernel, , signal_discard, signal-discard,
+ "struct thread *", "struct proc *", "int");
+
+static int coredump(struct thread *);
+static int killpg1(struct thread *td, int sig, int pgid, int all,
+ ksiginfo_t *ksi);
+static int issignal(struct thread *td);
+static int sigprop(int sig);
+static void tdsigwakeup(struct thread *, int, sig_t, int);
+static void sig_suspend_threads(struct thread *, struct proc *, int);
+static int filt_sigattach(struct knote *kn);
+static void filt_sigdetach(struct knote *kn);
+static int filt_signal(struct knote *kn, long hint);
+static struct thread *sigtd(struct proc *p, int sig, int prop);
+static void sigqueue_start(void);
+
+static uma_zone_t ksiginfo_zone = NULL;
+struct filterops sig_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_sigattach,
+ .f_detach = filt_sigdetach,
+ .f_event = filt_signal,
+};
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+ &kern_logsigexit, 0,
+ "Log processes quitting on abnormal signals to syslog(3)");
+
+static int kern_forcesigexit = 1;
+SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
+ &kern_forcesigexit, 0, "Force trap signal to be handled");
+
+static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0,
+ "POSIX real time signal");
+
+static int max_pending_per_proc = 128;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
+ &max_pending_per_proc, 0, "Max pending signals per proc");
+
+static int preallocate_siginfo = 1024;
+TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
+ &preallocate_siginfo, 0, "Preallocated signal memory size");
+
+static int signal_overflow = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
+    &signal_overflow, 0, "Number of signals that overflowed");
+
+static int signal_alloc_fail = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
+    &signal_alloc_fail, 0, "Number of failed signal allocations");
+
+SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
+
+/*
+ * Policy -- Can ucred cr1 send SIGIO to a process with ucred cr2?
+ * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
+ * in the right situations.
+ */
+#define CANSIGIO(cr1, cr2) \
+ ((cr1)->cr_uid == 0 || \
+ (cr1)->cr_ruid == (cr2)->cr_ruid || \
+ (cr1)->cr_uid == (cr2)->cr_ruid || \
+ (cr1)->cr_ruid == (cr2)->cr_uid || \
+ (cr1)->cr_uid == (cr2)->cr_uid)
+
+static int sugid_coredump;
+TUNABLE_INT("kern.sugid_coredump", &sugid_coredump);
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
+ &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
+
+static int capmode_coredump;
+TUNABLE_INT("kern.capmode_coredump", &capmode_coredump);
+SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RW,
+ &capmode_coredump, 0, "Allow processes in capability mode to dump core");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+ &do_coredump, 0, "Enable/Disable coredumps");
+
+static int set_core_nodump_flag = 0;
+SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
+ 0, "Enable setting the NODUMP flag on coredump files");
+
+/*
+ * Signal properties and actions.
+ * The array below categorizes the signals and their default actions
+ * according to the following properties:
+ */
+#define SA_KILL 0x01 /* terminates process by default */
+#define SA_CORE 0x02 /* ditto and coredumps */
+#define SA_STOP 0x04 /* suspend process */
+#define SA_TTYSTOP 0x08 /* ditto, from tty */
+#define SA_IGNORE 0x10 /* ignore by default */
+#define SA_CONT 0x20 /* continue if suspended */
+#define SA_CANTMASK 0x40 /* non-maskable, catchable */
+
+static int sigproptbl[NSIG] = {
+ SA_KILL, /* SIGHUP */
+ SA_KILL, /* SIGINT */
+ SA_KILL|SA_CORE, /* SIGQUIT */
+ SA_KILL|SA_CORE, /* SIGILL */
+ SA_KILL|SA_CORE, /* SIGTRAP */
+ SA_KILL|SA_CORE, /* SIGABRT */
+ SA_KILL|SA_CORE, /* SIGEMT */
+ SA_KILL|SA_CORE, /* SIGFPE */
+ SA_KILL, /* SIGKILL */
+ SA_KILL|SA_CORE, /* SIGBUS */
+ SA_KILL|SA_CORE, /* SIGSEGV */
+ SA_KILL|SA_CORE, /* SIGSYS */
+ SA_KILL, /* SIGPIPE */
+ SA_KILL, /* SIGALRM */
+ SA_KILL, /* SIGTERM */
+ SA_IGNORE, /* SIGURG */
+ SA_STOP, /* SIGSTOP */
+ SA_STOP|SA_TTYSTOP, /* SIGTSTP */
+ SA_IGNORE|SA_CONT, /* SIGCONT */
+ SA_IGNORE, /* SIGCHLD */
+ SA_STOP|SA_TTYSTOP, /* SIGTTIN */
+ SA_STOP|SA_TTYSTOP, /* SIGTTOU */
+ SA_IGNORE, /* SIGIO */
+ SA_KILL, /* SIGXCPU */
+ SA_KILL, /* SIGXFSZ */
+ SA_KILL, /* SIGVTALRM */
+ SA_KILL, /* SIGPROF */
+ SA_IGNORE, /* SIGWINCH */
+ SA_IGNORE, /* SIGINFO */
+ SA_KILL, /* SIGUSR1 */
+ SA_KILL, /* SIGUSR2 */
+};
+
+static void reschedule_signals(struct proc *p, sigset_t block, int flags);
+
+static void
+sigqueue_start(void)
+{
+ ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_prealloc(ksiginfo_zone, preallocate_siginfo);
+ p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
+ p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
+ p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
+}
+
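+/*
+ * Allocate a zeroed ksiginfo from the UMA zone.  A non-zero wait argument
+ * allows the allocation to sleep; otherwise M_NOWAIT is used and NULL may
+ * be returned.
+ */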
+ksiginfo_t *
+ksiginfo_alloc(int wait)
+{
+ int flags;
+
+ flags = M_ZERO;
+ if (! wait)
+ flags |= M_NOWAIT;
+ if (ksiginfo_zone != NULL)
+ return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
+ return (NULL);
+}
+
+void
+ksiginfo_free(ksiginfo_t *ksi)
+{
+ uma_zfree(ksiginfo_zone, ksi);
+}
+
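+/*
+ * Free a ksiginfo unless it is externally managed (KSI_EXT); return 1 if
+ * it was freed, 0 otherwise.
+ */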
+static __inline int
+ksiginfo_tryfree(ksiginfo_t *ksi)
+{
+ if (!(ksi->ksi_flags & KSI_EXT)) {
+ uma_zfree(ksiginfo_zone, ksi);
+ return (1);
+ }
+ return (0);
+}
+
+void
+sigqueue_init(sigqueue_t *list, struct proc *p)
+{
+ SIGEMPTYSET(list->sq_signals);
+ SIGEMPTYSET(list->sq_kill);
+ TAILQ_INIT(&list->sq_list);
+ list->sq_proc = p;
+ list->sq_flags = SQ_INIT;
+}
+
+/*
+ * Get a signal's ksiginfo.
+ * Return:
+ * 0 - signal not found
+ * others - signal number
+ */
+static int
+sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi, *next;
+ int count = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (!SIGISMEMBER(sq->sq_signals, signo))
+ return (0);
+
+ if (SIGISMEMBER(sq->sq_kill, signo)) {
+ count++;
+ SIGDELSET(sq->sq_kill, signo);
+ }
+
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (ksi->ksi_signo == signo) {
+ if (count == 0) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ ksiginfo_copy(ksi, si);
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ if (++count > 1)
+ break;
+ }
+ }
+
+ if (count <= 1)
+ SIGDELSET(sq->sq_signals, signo);
+ si->ksi_signo = signo;
+ return (signo);
+}
+
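+/*
+ * Remove a queued ksiginfo from the sigqueue it sits on, clearing the
+ * pending bit for its signal when no other instance of that signal
+ * remains queued.
+ */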
+void
+sigqueue_take(ksiginfo_t *ksi)
+{
+ struct ksiginfo *kp;
+ struct proc *p;
+ sigqueue_t *sq;
+
+ if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
+ return;
+
+ p = sq->sq_proc;
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
+ p->p_pendingcnt--;
+
+ for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
+ kp = TAILQ_NEXT(kp, ksi_link)) {
+ if (kp->ksi_signo == ksi->ksi_signo)
+ break;
+ }
+ if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
+ SIGDELSET(sq->sq_signals, ksi->ksi_signo);
+}
+
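+/*
+ * Queue a signal on the given sigqueue.  SIGKILL, SIGSTOP and signals
+ * without siginfo are recorded only in the sq_kill bitmap; other signals
+ * have their ksiginfo inserted (or copied) onto sq_list, falling back to
+ * the bitmap (or returning EAGAIN for sigqueue(2) signals) when the
+ * per-process limit is reached or allocation fails.
+ */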
+static int
+sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi;
+ int ret = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ /* directly insert the ksi, don't copy it */
+ if (si->ksi_flags & KSI_INS) {
+ if (si->ksi_flags & KSI_HEAD)
+ TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
+ else
+ TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
+ si->ksi_sigq = sq;
+ goto out_set_bit;
+ }
+
+ if (__predict_false(ksiginfo_zone == NULL)) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
+ signal_overflow++;
+ ret = EAGAIN;
+ } else if ((ksi = ksiginfo_alloc(0)) == NULL) {
+ signal_alloc_fail++;
+ ret = EAGAIN;
+ } else {
+ if (p != NULL)
+ p->p_pendingcnt++;
+ ksiginfo_copy(si, ksi);
+ ksi->ksi_signo = signo;
+ if (si->ksi_flags & KSI_HEAD)
+ TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
+ else
+ TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = sq;
+ }
+
+ if ((si->ksi_flags & KSI_TRAP) != 0 ||
+ (si->ksi_flags & KSI_SIGQ) == 0) {
+ if (ret != 0)
+ SIGADDSET(sq->sq_kill, signo);
+ ret = 0;
+ goto out_set_bit;
+ }
+
+ if (ret != 0)
+ return (ret);
+
+out_set_bit:
+ SIGADDSET(sq->sq_signals, signo);
+ return (ret);
+}
+
+void
+sigqueue_flush(sigqueue_t *sq)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+
+ SIGEMPTYSET(sq->sq_signals);
+ SIGEMPTYSET(sq->sq_kill);
+}
+
+static void
+sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
+{
+ sigset_t tmp;
+ struct proc *p1, *p2;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+ KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
+ p1 = src->sq_proc;
+ p2 = dst->sq_proc;
+ /* Move siginfo to target list */
+ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
+ if (p1 != NULL)
+ p1->p_pendingcnt--;
+ TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = dst;
+ if (p2 != NULL)
+ p2->p_pendingcnt++;
+ }
+ }
+
+ /* Move pending bits to target list */
+ tmp = src->sq_kill;
+ SIGSETAND(tmp, *set);
+ SIGSETOR(dst->sq_kill, tmp);
+ SIGSETNAND(src->sq_kill, tmp);
+
+ tmp = src->sq_signals;
+ SIGSETAND(tmp, *set);
+ SIGSETOR(dst->sq_signals, tmp);
+ SIGSETNAND(src->sq_signals, tmp);
+}
+
+#if 0
+static void
+sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_move_set(src, dst, &set);
+}
+#endif
+
+static void
+sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ /* Remove matching siginfo entries from the queue. */
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ }
+ SIGSETNAND(sq->sq_kill, *set);
+ SIGSETNAND(sq->sq_signals, *set);
+}
+
+void
+sigqueue_delete(sigqueue_t *sq, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set(sq, &set);
+}
+
+/* Remove a set of signals for a process */
+static void
+sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
+{
+ sigqueue_t worklist;
+ struct thread *td0;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ sigqueue_init(&worklist, NULL);
+ sigqueue_move_set(&p->p_sigqueue, &worklist, set);
+
+ FOREACH_THREAD_IN_PROC(p, td0)
+ sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
+
+ sigqueue_flush(&worklist);
+}
+
+void
+sigqueue_delete_proc(struct proc *p, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set_proc(p, &set);
+}
+
+static void
+sigqueue_delete_stopmask_proc(struct proc *p)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, SIGSTOP);
+ SIGADDSET(set, SIGTSTP);
+ SIGADDSET(set, SIGTTIN);
+ SIGADDSET(set, SIGTTOU);
+ sigqueue_delete_set_proc(p, &set);
+}
+
+/*
+ * Determine the signal that should be delivered to thread td, the current
+ * thread; return 0 if there is none. If there is a pending stop signal
+ * with a default action, the process stops in issignal().
+ */
+int
+cursig(struct thread *td)
+{
+ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
+ mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
+ return (SIGPENDING(td) ? issignal(td) : 0);
+}
+
+/*
+ * Arrange for ast() to handle unmasked pending signals on return to user
+ * mode. This must be called whenever a signal is added to td_sigqueue or
+ * unmasked in td_sigmask.
+ */
+void
+signotify(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (SIGPENDING(td)) {
+ thread_lock(td);
+ td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+}
+
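+/*
+ * Return non-zero if the given stack pointer lies on the current thread's
+ * alternate signal stack.
+ */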
+int
+sigonstack(size_t sp)
+{
+ struct thread *td = curthread;
+
+ return ((td->td_pflags & TDP_ALTSTACK) ?
+#if defined(COMPAT_43)
+ ((td->td_sigstk.ss_size == 0) ?
+ (td->td_sigstk.ss_flags & SS_ONSTACK) :
+ ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
+#else
+ ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
+#endif
+ : 0);
+}
+
+static __inline int
+sigprop(int sig)
+{
+
+ if (sig > 0 && sig < NSIG)
+ return (sigproptbl[_SIG_IDX(sig)]);
+ return (0);
+}
+
+int
+sig_ffs(sigset_t *set)
+{
+ int i;
+
+ for (i = 0; i < _SIG_WORDS; i++)
+ if (set->__bits[i])
+ return (ffs(set->__bits[i]) + (i * 32));
+ return (0);
+}
+
+/*
+ * kern_sigaction
+ * sigaction
+ * freebsd4_sigaction
+ * osigaction
+ */
+int
+kern_sigaction(td, sig, act, oact, flags)
+ struct thread *td;
+ register int sig;
+ struct sigaction *act, *oact;
+ int flags;
+{
+ struct sigacts *ps;
+ struct proc *p = td->td_proc;
+
+ if (!_SIG_VALID(sig))
+ return (EINVAL);
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if (oact) {
+ oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ oact->sa_flags = 0;
+ if (SIGISMEMBER(ps->ps_sigonstack, sig))
+ oact->sa_flags |= SA_ONSTACK;
+ if (!SIGISMEMBER(ps->ps_sigintr, sig))
+ oact->sa_flags |= SA_RESTART;
+ if (SIGISMEMBER(ps->ps_sigreset, sig))
+ oact->sa_flags |= SA_RESETHAND;
+ if (SIGISMEMBER(ps->ps_signodefer, sig))
+ oact->sa_flags |= SA_NODEFER;
+ if (SIGISMEMBER(ps->ps_siginfo, sig)) {
+ oact->sa_flags |= SA_SIGINFO;
+ oact->sa_sigaction =
+ (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
+ } else
+ oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
+ if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
+ oact->sa_flags |= SA_NOCLDSTOP;
+ if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
+ oact->sa_flags |= SA_NOCLDWAIT;
+ }
+ if (act) {
+ if ((sig == SIGKILL || sig == SIGSTOP) &&
+ act->sa_handler != SIG_DFL) {
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ /*
+ * Change setting atomically.
+ */
+
+ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
+ SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
+ if (act->sa_flags & SA_SIGINFO) {
+ ps->ps_sigact[_SIG_IDX(sig)] =
+ (__sighandler_t *)act->sa_sigaction;
+ SIGADDSET(ps->ps_siginfo, sig);
+ } else {
+ ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
+ SIGDELSET(ps->ps_siginfo, sig);
+ }
+ if (!(act->sa_flags & SA_RESTART))
+ SIGADDSET(ps->ps_sigintr, sig);
+ else
+ SIGDELSET(ps->ps_sigintr, sig);
+ if (act->sa_flags & SA_ONSTACK)
+ SIGADDSET(ps->ps_sigonstack, sig);
+ else
+ SIGDELSET(ps->ps_sigonstack, sig);
+ if (act->sa_flags & SA_RESETHAND)
+ SIGADDSET(ps->ps_sigreset, sig);
+ else
+ SIGDELSET(ps->ps_sigreset, sig);
+ if (act->sa_flags & SA_NODEFER)
+ SIGADDSET(ps->ps_signodefer, sig);
+ else
+ SIGDELSET(ps->ps_signodefer, sig);
+ if (sig == SIGCHLD) {
+ if (act->sa_flags & SA_NOCLDSTOP)
+ ps->ps_flag |= PS_NOCLDSTOP;
+ else
+ ps->ps_flag &= ~PS_NOCLDSTOP;
+ if (act->sa_flags & SA_NOCLDWAIT) {
+ /*
+ * Paranoia: since SA_NOCLDWAIT is implemented
+ * by reparenting the dying child to PID 1 (and
+ * trusting it to reap the zombie), PID 1 itself
+ * is forbidden to set SA_NOCLDWAIT.
+ */
+ if (p->p_pid == 1)
+ ps->ps_flag &= ~PS_NOCLDWAIT;
+ else
+ ps->ps_flag |= PS_NOCLDWAIT;
+ } else
+ ps->ps_flag &= ~PS_NOCLDWAIT;
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ ps->ps_flag |= PS_CLDSIGIGN;
+ else
+ ps->ps_flag &= ~PS_CLDSIGIGN;
+ }
+ /*
+ * Set bit in ps_sigignore for signals that are set to SIG_IGN,
+ * and for signals set to SIG_DFL where the default is to
+ * ignore. However, don't put SIGCONT in ps_sigignore, as we
+ * have to restart the process.
+ */
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ (sigprop(sig) & SA_IGNORE &&
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
+ /* never to be seen again */
+ sigqueue_delete_proc(p, sig);
+ if (sig != SIGCONT)
+ /* easier in psignal */
+ SIGADDSET(ps->ps_sigignore, sig);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ } else {
+ SIGDELSET(ps->ps_sigignore, sig);
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
+ SIGDELSET(ps->ps_sigcatch, sig);
+ else
+ SIGADDSET(ps->ps_sigcatch, sig);
+ }
+#ifdef COMPAT_FREEBSD4
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
+ (flags & KSA_FREEBSD4) == 0)
+ SIGDELSET(ps->ps_freebsd4, sig);
+ else
+ SIGADDSET(ps->ps_freebsd4, sig);
+#endif
+#ifdef COMPAT_43
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
+ (flags & KSA_OSIGSET) == 0)
+ SIGDELSET(ps->ps_osigset, sig);
+ else
+ SIGADDSET(ps->ps_osigset, sig);
+#endif
+ }
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaction_args {
+ int sig;
+ struct sigaction *act;
+ struct sigaction *oact;
+};
+#endif
+int
+sys_sigaction(td, uap)
+ struct thread *td;
+ register struct sigaction_args *uap;
+{
+ struct sigaction act, oact;
+ register struct sigaction *actp, *oactp;
+ int error;
+
+ actp = (uap->act != NULL) ? &act : NULL;
+ oactp = (uap->oact != NULL) ? &oact : NULL;
+ if (actp) {
+ error = copyin(uap->act, actp, sizeof(act));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaction(td, uap->sig, actp, oactp, 0);
+ if (oactp && !error)
+ error = copyout(oactp, uap->oact, sizeof(oact));
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD4
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_sigaction_args {
+ int sig;
+ struct sigaction *act;
+ struct sigaction *oact;
+};
+#endif
+int
+freebsd4_sigaction(td, uap)
+ struct thread *td;
+ register struct freebsd4_sigaction_args *uap;
+{
+ struct sigaction act, oact;
+ register struct sigaction *actp, *oactp;
+ int error;
+
+ actp = (uap->act != NULL) ? &act : NULL;
+ oactp = (uap->oact != NULL) ? &oact : NULL;
+ if (actp) {
+ error = copyin(uap->act, actp, sizeof(act));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
+ if (oactp && !error)
+ error = copyout(oactp, uap->oact, sizeof(oact));
+ return (error);
+}
+#endif /* COMPAT_FREEBSD4 */
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigaction_args {
+ int signum;
+ struct osigaction *nsa;
+ struct osigaction *osa;
+};
+#endif
+int
+osigaction(td, uap)
+ struct thread *td;
+ register struct osigaction_args *uap;
+{
+ struct osigaction sa;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+
+ nsap = (uap->nsa != NULL) ? &nsa : NULL;
+ osap = (uap->osa != NULL) ? &osa : NULL;
+
+ if (nsap) {
+ error = copyin(uap->nsa, &sa, sizeof(sa));
+ if (error)
+ return (error);
+ nsap->sa_handler = sa.sa_handler;
+ nsap->sa_flags = sa.sa_flags;
+ OSIG2SIG(sa.sa_mask, nsap->sa_mask);
+ }
+ error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
+ if (osap && !error) {
+ sa.sa_handler = osap->sa_handler;
+ sa.sa_flags = osap->sa_flags;
+ SIG2OSIG(osap->sa_mask, sa.sa_mask);
+ error = copyout(&sa, uap->osa, sizeof(sa));
+ }
+ return (error);
+}
+
+#if !defined(__i386__)
+/* Avoid replicating the same stub everywhere */
+int
+osigreturn(td, uap)
+ struct thread *td;
+ struct osigreturn_args *uap;
+{
+
+ return (nosys(td, (struct nosys_args *)uap));
+}
+#endif
+#endif /* COMPAT_43 */
+
+/*
+ * Initialize signal state for process 0;
+ * set to ignore signals that are ignored by default.
+ */
+void
+siginit(p)
+ struct proc *p;
+{
+ register int i;
+ struct sigacts *ps;
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ for (i = 1; i <= NSIG; i++)
+ if (sigprop(i) & SA_IGNORE && i != SIGCONT)
+ SIGADDSET(ps->ps_sigignore, i);
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+}
+
+/*
+ * Reset signals for an exec of the specified process.
+ */
+void
+execsigs(struct proc *p)
+{
+ struct sigacts *ps;
+ int sig;
+ struct thread *td;
+
+ /*
+ * Reset caught signals. Held signals remain held
+ * through td_sigmask (unless they were caught,
+ * and are now ignored by default).
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ td = FIRST_THREAD_IN_PROC(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ while (SIGNOTEMPTY(ps->ps_sigcatch)) {
+ sig = sig_ffs(&ps->ps_sigcatch);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sigprop(sig) & SA_IGNORE) {
+ if (sig != SIGCONT)
+ SIGADDSET(ps->ps_sigignore, sig);
+ sigqueue_delete_proc(p, sig);
+ }
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ /*
+ * Reset stack state to the user stack.
+ * Clear set of signals caught on the signal stack.
+ */
+ td->td_sigstk.ss_flags = SS_DISABLE;
+ td->td_sigstk.ss_size = 0;
+ td->td_sigstk.ss_sp = 0;
+ td->td_pflags &= ~TDP_ALTSTACK;
+ /*
+ * Reset the "no zombies if child dies" flag, as Solaris does.
+ */
+ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
+ mtx_unlock(&ps->ps_mtx);
+}
+
+/*
+ * kern_sigprocmask()
+ *
+ * Manipulate signal mask.
+ */
+int
+kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
+ int flags)
+{
+ sigset_t new_block, oset1;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+ if (!(flags & SIGPROCMASK_PROC_LOCKED))
+ PROC_LOCK(p);
+ if (oset != NULL)
+ *oset = td->td_sigmask;
+
+ error = 0;
+ if (set != NULL) {
+ switch (how) {
+ case SIG_BLOCK:
+ SIG_CANTMASK(*set);
+ oset1 = td->td_sigmask;
+ SIGSETOR(td->td_sigmask, *set);
+ new_block = td->td_sigmask;
+ SIGSETNAND(new_block, oset1);
+ break;
+ case SIG_UNBLOCK:
+ SIGSETNAND(td->td_sigmask, *set);
+ signotify(td);
+ goto out;
+ case SIG_SETMASK:
+ SIG_CANTMASK(*set);
+ oset1 = td->td_sigmask;
+ if (flags & SIGPROCMASK_OLD)
+ SIGSETLO(td->td_sigmask, *set);
+ else
+ td->td_sigmask = *set;
+ new_block = td->td_sigmask;
+ SIGSETNAND(new_block, oset1);
+ signotify(td);
+ break;
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The new_block set contains signals that were not previously
+ * blocked, but are blocked now.
+ *
+ * If we block any signal that was not previously blocked for td
+ * and the process has that signal pending, try to schedule signal
+ * delivery to some thread that does not block the signal, possibly
+ * waking it up.
+ */
+ if (p->p_numthreads != 1)
+ reschedule_signals(p, new_block, flags);
+ }
+
+out:
+ if (!(flags & SIGPROCMASK_PROC_LOCKED))
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigprocmask_args {
+ int how;
+ const sigset_t *set;
+ sigset_t *oset;
+};
+#endif
+int
+sys_sigprocmask(td, uap)
+ register struct thread *td;
+ struct sigprocmask_args *uap;
+{
+ sigset_t set, oset;
+ sigset_t *setp, *osetp;
+ int error;
+
+ setp = (uap->set != NULL) ? &set : NULL;
+ osetp = (uap->oset != NULL) ? &oset : NULL;
+ if (setp) {
+ error = copyin(uap->set, setp, sizeof(set));
+ if (error)
+ return (error);
+ }
+ error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
+ if (osetp && !error) {
+ error = copyout(osetp, uap->oset, sizeof(oset));
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigprocmask_args {
+ int how;
+ osigset_t mask;
+};
+#endif
+int
+osigprocmask(td, uap)
+ register struct thread *td;
+ struct osigprocmask_args *uap;
+{
+ sigset_t set, oset;
+ int error;
+
+ OSIG2SIG(uap->mask, set);
+ error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+int
+sys_sigwait(struct thread *td, struct sigwait_args *uap)
+{
+ ksiginfo_t ksi;
+ sigset_t set;
+ int error;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error) {
+ td->td_retval[0] = error;
+ return (0);
+ }
+
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
+ if (error) {
+ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT)
+ error = ERESTART;
+ if (error == ERESTART)
+ return (error);
+ td->td_retval[0] = error;
+ return (0);
+ }
+
+ error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
+ td->td_retval[0] = error;
+ return (0);
+}
+
+int
+sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
+{
+ struct timespec ts;
+ struct timespec *timeout;
+ sigset_t set;
+ ksiginfo_t ksi;
+ int error;
+
+ if (uap->timeout) {
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+
+ timeout = &ts;
+ } else
+ timeout = NULL;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error)
+ return (error);
+
+ error = kern_sigtimedwait(td, set, &ksi, timeout);
+ if (error)
+ return (error);
+
+ if (uap->info)
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
+ return (error);
+}
+
+int
+sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
+{
+ ksiginfo_t ksi;
+ sigset_t set;
+ int error;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error)
+ return (error);
+
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
+ if (error)
+ return (error);
+
+ if (uap->info)
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
+ return (error);
+}
+
+int
+kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
+ struct timespec *timeout)
+{
+ struct sigacts *ps;
+ sigset_t saved_mask, new_block;
+ struct proc *p;
+ int error, sig, timo, timevalid = 0;
+ struct timespec rts, ets, ts;
+ struct timeval tv;
+
+ p = td->td_proc;
+ error = 0;
+ ets.tv_sec = 0;
+ ets.tv_nsec = 0;
+
+ if (timeout != NULL) {
+ if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
+ timevalid = 1;
+ getnanouptime(&rts);
+ ets = rts;
+ timespecadd(&ets, timeout);
+ }
+ }
+ ksiginfo_init(ksi);
+ /* Some signals can not be waited for. */
+ SIG_CANTMASK(waitset);
+ ps = p->p_sigacts;
+ PROC_LOCK(p);
+ saved_mask = td->td_sigmask;
+ SIGSETNAND(td->td_sigmask, waitset);
+ for (;;) {
+ mtx_lock(&ps->ps_mtx);
+ sig = cursig(td);
+ mtx_unlock(&ps->ps_mtx);
+ if (sig != 0 && SIGISMEMBER(waitset, sig)) {
+ if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
+ sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
+ error = 0;
+ break;
+ }
+ }
+
+ if (error != 0)
+ break;
+
+ /*
+ * POSIX says this must be checked after looking for pending
+ * signals.
+ */
+ if (timeout != NULL) {
+ if (!timevalid) {
+ error = EINVAL;
+ break;
+ }
+ getnanouptime(&rts);
+ if (timespeccmp(&rts, &ets, >=)) {
+ error = EAGAIN;
+ break;
+ }
+ ts = ets;
+ timespecsub(&ts, &rts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ timo = tvtohz(&tv);
+ } else {
+ timo = 0;
+ }
+
+ error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
+
+ if (timeout != NULL) {
+ if (error == ERESTART) {
+ /* A timeout cannot be restarted. */
+ error = EINTR;
+ } else if (error == EAGAIN) {
+ /* We will recalculate the timeout ourselves. */
+ error = 0;
+ }
+ }
+ }
+
+ new_block = saved_mask;
+ SIGSETNAND(new_block, td->td_sigmask);
+ td->td_sigmask = saved_mask;
+ /*
+ * Fewer signals can be delivered to us, reschedule signal
+ * notification.
+ */
+ if (p->p_numthreads != 1)
+ reschedule_signals(p, new_block, 0);
+
+ if (error == 0) {
+ SDT_PROBE(proc, kernel, , signal_clear, sig, ksi, 0, 0, 0);
+
+ if (ksi->ksi_code == SI_TIMER)
+ itimer_accept(p, ksi->ksi_timerid, ksi);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_PSIG)) {
+ sig_t action;
+
+ mtx_lock(&ps->ps_mtx);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+ mtx_unlock(&ps->ps_mtx);
+ ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
+ }
+#endif
+ if (sig == SIGKILL)
+ sigexit(td, sig);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ sigset_t *set;
+};
+#endif
+int
+sys_sigpending(td, uap)
+ struct thread *td;
+ struct sigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t pending;
+
+ PROC_LOCK(p);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
+ PROC_UNLOCK(p);
+ return (copyout(&pending, uap->set, sizeof(sigset_t)));
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigpending_args {
+ int dummy;
+};
+#endif
+int
+osigpending(td, uap)
+ struct thread *td;
+ struct osigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t pending;
+
+ PROC_LOCK(p);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
+ PROC_UNLOCK(p);
+ SIG2OSIG(pending, td->td_retval[0]);
+ return (0);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43)
+/*
+ * Generalized interface signal handler, 4.3-compatible.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigvec_args {
+ int signum;
+ struct sigvec *nsv;
+ struct sigvec *osv;
+};
+#endif
+/* ARGSUSED */
+int
+osigvec(td, uap)
+ struct thread *td;
+ register struct osigvec_args *uap;
+{
+ struct sigvec vec;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+ nsap = (uap->nsv != NULL) ? &nsa : NULL;
+ osap = (uap->osv != NULL) ? &osa : NULL;
+ if (nsap) {
+ error = copyin(uap->nsv, &vec, sizeof(vec));
+ if (error)
+ return (error);
+ nsap->sa_handler = vec.sv_handler;
+ OSIG2SIG(vec.sv_mask, nsap->sa_mask);
+ nsap->sa_flags = vec.sv_flags;
+ nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
+ }
+ error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
+ if (osap && !error) {
+ vec.sv_handler = osap->sa_handler;
+ SIG2OSIG(osap->sa_mask, vec.sv_mask);
+ vec.sv_flags = osap->sa_flags;
+ vec.sv_flags &= ~SA_NOCLDWAIT;
+ vec.sv_flags ^= SA_RESTART;
+ error = copyout(&vec, uap->osv, sizeof(vec));
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigblock_args {
+ int mask;
+};
+#endif
+int
+osigblock(td, uap)
+ register struct thread *td;
+ struct osigblock_args *uap;
+{
+ sigset_t set, oset;
+
+ OSIG2SIG(uap->mask, set);
+ kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigsetmask_args {
+ int mask;
+};
+#endif
+int
+osigsetmask(td, uap)
+ struct thread *td;
+ struct osigsetmask_args *uap;
+{
+ sigset_t set, oset;
+
+ OSIG2SIG(uap->mask, set);
+ kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (0);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Suspend the calling thread until a signal arrives, providing a mask to
+ * be set in the meantime.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sigsuspend_args {
+ const sigset_t *sigmask;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sigsuspend(td, uap)
+ struct thread *td;
+ struct sigsuspend_args *uap;
+{
+ sigset_t mask;
+ int error;
+
+ error = copyin(uap->sigmask, &mask, sizeof(mask));
+ if (error)
+ return (error);
+ return (kern_sigsuspend(td, mask));
+}
+
+int
+kern_sigsuspend(struct thread *td, sigset_t mask)
+{
+ struct proc *p = td->td_proc;
+ int has_sig, sig;
+
+ /*
+ * When returning from sigsuspend, we want
+ * the old mask to be restored after the
+ * signal handler has finished. Thus, we
+ * save it here and mark the sigacts structure
+ * to indicate this.
+ */
+ PROC_LOCK(p);
+ kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
+ SIGPROCMASK_PROC_LOCKED);
+ td->td_pflags |= TDP_OLDMASK;
+
+ /*
+ * Process signals now. Otherwise, we can get a spurious wakeup
+ * when a signal enters the process queue but is delivered to
+ * another thread, while sigsuspend should return only on signal
+ * delivery.
+ */
+ (p->p_sysent->sv_set_syscall_retval)(td, EINTR);
+ for (has_sig = 0; !has_sig;) {
+ while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
+ 0) == 0)
+ /* void */;
+ thread_suspend_check(0);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ while ((sig = cursig(td)) != 0)
+ has_sig += postsig(sig);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ }
+ PROC_UNLOCK(p);
+ td->td_errno = EINTR;
+ td->td_pflags |= TDP_NERRNO;
+ return (EJUSTRETURN);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+/*
+ * Compatibility sigsuspend call for old binaries. Note nonstandard calling
+ * convention: libc stub passes mask, not pointer, to save a copyin.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigsuspend_args {
+ osigset_t mask;
+};
+#endif
+/* ARGSUSED */
+int
+osigsuspend(td, uap)
+ struct thread *td;
+ struct osigsuspend_args *uap;
+{
+ sigset_t mask;
+
+ OSIG2SIG(uap->mask, mask);
+ return (kern_sigsuspend(td, mask));
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct osigstack_args {
+ struct sigstack *nss;
+ struct sigstack *oss;
+};
+#endif
+/* ARGSUSED */
+int
+osigstack(td, uap)
+ struct thread *td;
+ register struct osigstack_args *uap;
+{
+ struct sigstack nss, oss;
+ int error = 0;
+
+ if (uap->nss != NULL) {
+ error = copyin(uap->nss, &nss, sizeof(nss));
+ if (error)
+ return (error);
+ }
+ oss.ss_sp = td->td_sigstk.ss_sp;
+ oss.ss_onstack = sigonstack(cpu_getstack(td));
+ if (uap->nss != NULL) {
+ td->td_sigstk.ss_sp = nss.ss_sp;
+ td->td_sigstk.ss_size = 0;
+ td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
+ td->td_pflags |= TDP_ALTSTACK;
+ }
+ if (uap->oss != NULL)
+ error = copyout(&oss, uap->oss, sizeof(oss));
+
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaltstack_args {
+ stack_t *ss;
+ stack_t *oss;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sigaltstack(td, uap)
+ struct thread *td;
+ register struct sigaltstack_args *uap;
+{
+ stack_t ss, oss;
+ int error;
+
+ if (uap->ss != NULL) {
+ error = copyin(uap->ss, &ss, sizeof(ss));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
+ (uap->oss != NULL) ? &oss : NULL);
+ if (error)
+ return (error);
+ if (uap->oss != NULL)
+ error = copyout(&oss, uap->oss, sizeof(stack_t));
+ return (error);
+}
+
+int
+kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
+{
+ struct proc *p = td->td_proc;
+ int oonstack;
+
+ oonstack = sigonstack(cpu_getstack(td));
+
+ if (oss != NULL) {
+ *oss = td->td_sigstk;
+ oss->ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ }
+
+ if (ss != NULL) {
+ if (oonstack)
+ return (EPERM);
+ if ((ss->ss_flags & ~SS_DISABLE) != 0)
+ return (EINVAL);
+ if (!(ss->ss_flags & SS_DISABLE)) {
+ if (ss->ss_size < p->p_sysent->sv_minsigstksz)
+ return (ENOMEM);
+
+ td->td_sigstk = *ss;
+ td->td_pflags |= TDP_ALTSTACK;
+ } else {
+ td->td_pflags &= ~TDP_ALTSTACK;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Common code for kill process group/broadcast kill.
+ * td identifies the calling thread.
+ */
+static int
+killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
+{
+ struct proc *p;
+ struct pgrp *pgrp;
+ int err;
+ int ret;
+
+ ret = ESRCH;
+ if (all) {
+ /*
+ * broadcast
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p == td->td_proc || p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ err = p_cansignal(td, p, sig);
+ if (err == 0) {
+ if (sig)
+ pksignal(p, sig, ksi);
+ ret = err;
+ }
+ else if (ret == ESRCH)
+ ret = err;
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ } else {
+ sx_slock(&proctree_lock);
+ if (pgid == 0) {
+ /*
+ * zero pgid means send to my process group.
+ */
+ pgrp = td->td_proc->p_pgrp;
+ PGRP_LOCK(pgrp);
+ } else {
+ pgrp = pgfind(pgid);
+ if (pgrp == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ err = p_cansignal(td, p, sig);
+ if (err == 0) {
+ if (sig)
+ pksignal(p, sig, ksi);
+ ret = err;
+ }
+ else if (ret == ESRCH)
+ ret = err;
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pgrp);
+ }
+ return (ret);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kill_args {
+ int pid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+sys_kill(struct thread *td, struct kill_args *uap)
+{
+ ksiginfo_t ksi;
+ struct proc *p;
+ int error;
+
+ /*
+ * A process in capability mode can send signals only to itself.
+ * The main rationale behind this is that abort(3) is implemented as
+ * kill(getpid(), SIGABRT).
+ */
+ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid)
+ return (ECAPMODE);
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_PID(uap->pid);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_USER;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+
+ if (uap->pid > 0) {
+ /* kill single process */
+ if ((p = pfind(uap->pid)) == NULL) {
+ if ((p = zpfind(uap->pid)) == NULL)
+ return (ESRCH);
+ }
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum)
+ pksignal(p, uap->signum, &ksi);
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ switch (uap->pid) {
+ case -1: /* broadcast signal */
+ return (killpg1(td, uap->signum, 0, 1, &ksi));
+ case 0: /* signal own process group */
+ return (killpg1(td, uap->signum, 0, 0, &ksi));
+ default: /* negative explicit process group */
+ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
+ }
+ /* NOTREACHED */
+}
+
+int
+sys_pdkill(td, uap)
+ struct thread *td;
+ struct pdkill_args *uap;
+{
+#ifdef PROCDESC
+ struct proc *p;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_FD(uap->fd);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ error = procdesc_find(td, uap->fd,
+ cap_rights_init(&rights, CAP_PDKILL), &p);
+ if (error)
+ return (error);
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum)
+ kern_psignal(p, uap->signum);
+ PROC_UNLOCK(p);
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct okillpg_args {
+ int pgid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+okillpg(struct thread *td, struct okillpg_args *uap)
+{
+ ksiginfo_t ksi;
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_PID(uap->pgid);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_USER;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigqueue_args {
+ pid_t pid;
+ int signum;
+ /* union sigval */ void *value;
+};
+#endif
+int
+sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
+{
+ ksiginfo_t ksi;
+ struct proc *p;
+ int error;
+
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ /*
+ * The specification says sigqueue can send a signal only to a
+ * single process.
+ */
+ if (uap->pid <= 0)
+ return (EINVAL);
+
+ if ((p = pfind(uap->pid)) == NULL) {
+ if ((p = zpfind(uap->pid)) == NULL)
+ return (ESRCH);
+ }
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum != 0) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_flags = KSI_SIGQ;
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_QUEUE;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ ksi.ksi_value.sival_ptr = uap->value;
+ error = pksignal(p, ksi.ksi_signo, &ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Send a signal to a process group.
+ */
+void
+gsignal(int pgid, int sig, ksiginfo_t *ksi)
+{
+ struct pgrp *pgrp;
+
+ if (pgid != 0) {
+ sx_slock(&proctree_lock);
+ pgrp = pgfind(pgid);
+ sx_sunlock(&proctree_lock);
+ if (pgrp != NULL) {
+ pgsignal(pgrp, sig, 0, ksi);
+ PGRP_UNLOCK(pgrp);
+ }
+ }
+}
+
+/*
+ * Send a signal to a process group. If checkctty is 1,
+ * limit to members which have a controlling terminal.
+ */
+void
+pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
+{
+ struct proc *p;
+
+ if (pgrp) {
+ PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ (checkctty == 0 || p->p_flag & P_CONTROLT))
+ pksignal(p, sig, ksi);
+ PROC_UNLOCK(p);
+ }
+ }
+}
+
+/*
+ * Send a signal caused by a trap to the current thread. If it will be
+ * caught immediately, deliver it with correct code. Otherwise, post it
+ * normally.
+ */
+void
+trapsignal(struct thread *td, ksiginfo_t *ksi)
+{
+ struct sigacts *ps;
+ sigset_t mask;
+ struct proc *p;
+ int sig;
+ int code;
+
+ p = td->td_proc;
+ sig = ksi->ksi_signo;
+ code = ksi->ksi_code;
+ KASSERT(_SIG_VALID(sig), ("invalid signal"));
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
+ !SIGISMEMBER(td->td_sigmask, sig)) {
+ td->td_ru.ru_nsignals++;
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_PSIG))
+ ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
+ &td->td_sigmask, code);
+#endif
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
+ ksi, &td->td_sigmask);
+ mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(mask, sig);
+ kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
+ SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See kern_sigaction() for origin of this code.
+ */
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ mtx_unlock(&ps->ps_mtx);
+ } else {
+ /*
+ * Avoid a possible infinite loop if the thread is
+ * masking the signal or the process is ignoring the
+ * signal.
+ */
+ if (kern_forcesigexit &&
+ (SIGISMEMBER(td->td_sigmask, sig) ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
+ SIGDELSET(td->td_sigmask, sig);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ SIGDELSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ mtx_unlock(&ps->ps_mtx);
+ p->p_code = code; /* XXX for core dump/debugger */
+ p->p_sig = sig; /* XXX to verify code */
+ tdsendsignal(p, td, sig, ksi);
+ }
+ PROC_UNLOCK(p);
+}
+
+static struct thread *
+sigtd(struct proc *p, int sig, int prop)
+{
+ struct thread *td, *signal_td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * Check if current thread can handle the signal without
+ * switching context to another thread.
+ */
+ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
+ return (curthread);
+ signal_td = NULL;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!SIGISMEMBER(td->td_sigmask, sig)) {
+ signal_td = td;
+ break;
+ }
+ }
+ if (signal_td == NULL)
+ signal_td = FIRST_THREAD_IN_PROC(p);
+ return (signal_td);
+}
+
+/*
+ * Send the signal to the process. If the signal has an action, the action
+ * is usually performed by the target process rather than the caller; we add
+ * the signal to the set of pending signals for the process.
+ *
+ * Exceptions:
+ * o When a stop signal is sent to a sleeping process that takes the
+ * default action, the process is stopped without awakening it.
+ * o SIGCONT restarts stopped processes (or puts them back to sleep)
+ * regardless of the signal action (eg, blocked or ignored).
+ *
+ * Other ignored signals are discarded immediately.
+ *
+ * NB: This function may be entered from the debugger via the "kill" DDB
+ * command. There is little that can be done to mitigate the possibly messy
+ * side effects of this unwise possibility.
+ */
+void
+kern_psignal(struct proc *p, int sig)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ (void) tdsendsignal(p, NULL, sig, &ksi);
+}
+
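+/*
+ * Send a signal, with the supplied siginfo, to a process whose lock is
+ * already held; tdsendsignal() selects the target thread.
+ */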
+int
+pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
+{
+
+ return (tdsendsignal(p, NULL, sig, ksi));
+}
+
+/* Utility function for finding a thread to send signal event to. */
+int
+sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
+{
+ struct thread *td;
+
+ if (sigev->sigev_notify == SIGEV_THREAD_ID) {
+ td = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
+ if (td == NULL)
+ return (ESRCH);
+ *ttd = td;
+ } else {
+ *ttd = NULL;
+ PROC_LOCK(p);
+ }
+ return (0);
+}
+
+void
+tdsignal(struct thread *td, int sig)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ (void) tdsendsignal(td->td_proc, td, sig, &ksi);
+}
+
+void
+tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
+{
+
+ (void) tdsendsignal(td->td_proc, td, sig, ksi);
+}
+
+int
+tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
+{
+ sig_t action;
+ sigqueue_t *sigqueue;
+ int prop;
+ struct sigacts *ps;
+ int intrval;
+ int ret = 0;
+ int wakeup_swapper;
+
+ MPASS(td == NULL || p == td->td_proc);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (!_SIG_VALID(sig))
+ panic("%s(): invalid signal %d", __func__, sig);
+
+ KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
+
+ /*
+ * IEEE Std 1003.1-2001: return success when killing a zombie.
+ */
+ if (p->p_state == PRS_ZOMBIE) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+
+ ps = p->p_sigacts;
+ KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
+ prop = sigprop(sig);
+
+ if (td == NULL) {
+ td = sigtd(p, sig, prop);
+ sigqueue = &p->p_sigqueue;
+ } else
+ sigqueue = &td->td_sigqueue;
+
+ SDT_PROBE(proc, kernel, , signal_send, td, p, sig, 0, 0 );
+
+ /*
+ * If the signal is being ignored,
+ * then we forget about it immediately.
+ * (Note: we don't set SIGCONT in ps_sigignore,
+ * and if it is set to SIG_IGN,
+ * action will be SIG_DFL here.)
+ */
+ mtx_lock(&ps->ps_mtx);
+ if (SIGISMEMBER(ps->ps_sigignore, sig)) {
+ SDT_PROBE(proc, kernel, , signal_discard, td, p, sig, 0, 0 );
+
+ mtx_unlock(&ps->ps_mtx);
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ action = SIG_HOLD;
+ else if (SIGISMEMBER(ps->ps_sigcatch, sig))
+ action = SIG_CATCH;
+ else
+ action = SIG_DFL;
+ if (SIGISMEMBER(ps->ps_sigintr, sig))
+ intrval = EINTR;
+ else
+ intrval = ERESTART;
+ mtx_unlock(&ps->ps_mtx);
+
+ if (prop & SA_CONT)
+ sigqueue_delete_stopmask_proc(p);
+ else if (prop & SA_STOP) {
+ /*
+ * If sending a tty stop signal to a member of an orphaned
+ * process group, discard the signal here if the action
+ * is default; don't stop the process below if sleeping,
+ * and don't clear any pending SIGCONT.
+ */
+ if ((prop & SA_TTYSTOP) &&
+ (p->p_pgrp->pg_jobc == 0) &&
+ (action == SIG_DFL)) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ sigqueue_delete_proc(p, SIGCONT);
+ if (p->p_flag & P_CONTINUED) {
+ p->p_flag &= ~P_CONTINUED;
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+ }
+ }
+
+ ret = sigqueue_add(sigqueue, sig, ksi);
+ if (ret != 0)
+ return (ret);
+ signotify(td);
+ /*
+ * Defer further processing for signals which are held,
+ * except that stopped processes must be continued by SIGCONT.
+ */
+ if (action == SIG_HOLD &&
+ !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
+ return (ret);
+ /*
+ * SIGKILL: Remove procfs STOPEVENTs.
+ */
+ if (sig == SIGKILL) {
+ /* from procfs_ioctl.c: PIOCBIC */
+ p->p_stops = 0;
+ /* from procfs_ioctl.c: PIOCCONT */
+ p->p_step = 0;
+ wakeup(&p->p_step);
+ }
+ /*
+ * Some signals have a process-wide effect and a per-thread
+ * component. Most processing occurs when the process next
+ * tries to cross the user boundary; however, there are times
+ * when processing needs to be done immediately, such as
+ * waking up threads so that they can cross the user boundary.
+ * We try to do the per-process part here.
+ */
+ if (P_SHOULDSTOP(p)) {
+ KASSERT(!(p->p_flag & P_WEXIT),
+ ("signal to stopped but exiting process"));
+ if (sig == SIGKILL) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * SIGKILL sets process running.
+ * It will die elsewhere.
+ * All threads must be restarted.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ goto runfast;
+ }
+
+ if (prop & SA_CONT) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * If SIGCONT is default (or ignored), we continue the
+ * process but don't leave the signal in sigqueue as
+ * it has no further action. If SIGCONT is held, we
+ * continue the process and leave the signal in
+ * sigqueue. If the process catches SIGCONT, let it
+ * handle the signal itself. If it isn't waiting on
+ * an event, it goes back to run state.
+ * Otherwise, process goes back to sleep state.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ PROC_SLOCK(p);
+ if (p->p_numthreads == p->p_suspcount) {
+ PROC_SUNLOCK(p);
+ p->p_flag |= P_CONTINUED;
+ p->p_xstat = SIGCONT;
+ PROC_LOCK(p->p_pptr);
+ childproc_continued(p);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
+ }
+ if (action == SIG_DFL) {
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+ if (action == SIG_CATCH) {
+ /*
+ * The process wants to catch it so it needs
+ * to run at least one thread, but which one?
+ */
+ PROC_SUNLOCK(p);
+ goto runfast;
+ }
+ /*
+ * The signal is not ignored or caught.
+ */
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+
+ if (prop & SA_STOP) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * Already stopped; there is no need to stop again
+ * (if we did, the shell could get confused).
+ * Just make sure the signal's STOP bit is set.
+ */
+ p->p_flag |= P_STOPPED_SIG;
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+
+ /*
+ * All other kinds of signals:
+ * If a thread is sleeping interruptibly, simulate a
+ * wakeup so that when it is continued it will be made
+ * runnable and can look at the signal. However, don't make
+ * the PROCESS runnable, leave it stopped.
+ * It may run a bit until it hits a thread_suspend_check().
+ */
+ wakeup_swapper = 0;
+ PROC_SLOCK(p);
+ thread_lock(td);
+ if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
+ wakeup_swapper = sleepq_abort(td, intrval);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ if (wakeup_swapper)
+ kick_proc0();
+ goto out;
+ /*
+ * Mutexes are short lived. Threads waiting on them will
+ * hit thread_suspend_check() soon.
+ */
+ } else if (p->p_state == PRS_NORMAL) {
+ if (p->p_flag & P_TRACED || action == SIG_CATCH) {
+ tdsigwakeup(td, sig, action, intrval);
+ goto out;
+ }
+
+ MPASS(action == SIG_DFL);
+
+ if (prop & SA_STOP) {
+ if (p->p_flag & (P_PPWAIT|P_WEXIT))
+ goto out;
+ p->p_flag |= P_STOPPED_SIG;
+ p->p_xstat = sig;
+ PROC_SLOCK(p);
+ sig_suspend_threads(td, p, 1);
+ if (p->p_numthreads == p->p_suspcount) {
+ /*
+ * Only a thread sending a signal to another process
+ * can reach here: if the thread were signalling its
+ * own process, p_numthreads could never equal
+ * p_suspcount, because the sending thread does not
+ * suspend itself here.
+ */
+ thread_stopped(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete_proc(p, p->p_xstat);
+ } else
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+ } else {
+ /* Not in "NORMAL" state. discard the signal. */
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+
+ /*
+ * The process is not stopped so we need to apply the signal to all the
+ * running threads.
+ */
+runfast:
+ tdsigwakeup(td, sig, action, intrval);
+ PROC_SLOCK(p);
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+out:
+ /* If we jump here, proc slock should not be owned. */
+ PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
+ return (ret);
+}
+
+/*
+ * The force of a signal has been directed against a single
+ * thread. We need to see what we can do about knocking it
+ * out of any sleep it may be in etc.
+ */
+static void
+tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
+{
+ struct proc *p = td->td_proc;
+ register int prop;
+ int wakeup_swapper;
+
+ wakeup_swapper = 0;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ prop = sigprop(sig);
+
+ PROC_SLOCK(p);
+ thread_lock(td);
+ /*
+ * Bring the priority of a thread up if we want it to get
+ * killed in this lifetime.
+ */
+ if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
+ sched_prio(td, PUSER);
+ if (TD_ON_SLEEPQ(td)) {
+ /*
+ * If thread is sleeping uninterruptibly
+ * we can't interrupt the sleep... the signal will
+ * be noticed when the process returns through
+ * trap() or syscall().
+ */
+ if ((td->td_flags & TDF_SINTR) == 0)
+ goto out;
+ /*
+ * If SIGCONT is default (or ignored) and process is
+ * asleep, we are finished; the process should not
+ * be awakened.
+ */
+ if ((prop & SA_CONT) && action == SIG_DFL) {
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(&p->p_sigqueue, sig);
+ /*
+ * It may be on either list in this state.
+ * Remove from both for now.
+ */
+ sigqueue_delete(&td->td_sigqueue, sig);
+ return;
+ }
+
+ /*
+ * Don't awaken a sleeping thread for SIGSTOP if the
+ * STOP signal is deferred.
+ */
+ if ((prop & SA_STOP) && (td->td_flags & TDF_SBDRY))
+ goto out;
+
+ /*
+ * Give low priority threads a better chance to run.
+ */
+ if (td->td_priority > PUSER)
+ sched_prio(td, PUSER);
+
+ wakeup_swapper = sleepq_abort(td, intrval);
+ } else {
+ /*
+ * Other states do nothing with the signal immediately,
+ * other than kicking ourselves if we are running.
+ * It will either never be noticed, or noticed very soon.
+ */
+#ifdef SMP
+ if (TD_IS_RUNNING(td) && td != curthread)
+ forward_signal(td);
+#endif
+ }
+out:
+ PROC_SUNLOCK(p);
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
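+/*
+ * Ask all threads in the process to suspend: interruptibly sleeping
+ * threads are suspended directly (unless stops are deferred), while the
+ * rest are flagged so they stop in thread_suspend_check().
+ */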
+static void
+sig_suspend_threads(struct thread *td, struct proc *p, int sending)
+{
+ struct thread *td2;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ thread_lock(td2);
+ td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
+ (td2->td_flags & TDF_SINTR)) {
+ if (td2->td_flags & TDF_SBDRY) {
+ /*
+ * Once a thread is asleep with
+ * TDF_SBDRY set, it should never
+ * become suspended due to this check.
+ */
+ KASSERT(!TD_IS_SUSPENDED(td2),
+ ("thread with deferred stops suspended"));
+ } else if (!TD_IS_SUSPENDED(td2)) {
+ thread_suspend_one(td2);
+ }
+ } else if (!TD_IS_SUSPENDED(td2)) {
+ if (sending || td != td2)
+ td2->td_flags |= TDF_ASTPENDING;
+#ifdef SMP
+ if (TD_IS_RUNNING(td2) && td2 != td)
+ forward_signal(td2);
+#endif
+ }
+ thread_unlock(td2);
+ }
+}
+
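+/*
+ * Stop the current thread on behalf of the debugger when a traced
+ * process takes a signal, and return the (possibly different) signal
+ * number the debugger left in td_xsig once the thread resumes.
+ */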
+int
+ptracestop(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process"));
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
+ &p->p_mtx.lock_object, "Stopping for traced signal");
+
+ td->td_dbgflags |= TDB_XSIG;
+ td->td_xsig = sig;
+ PROC_SLOCK(p);
+ while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
+ if (p->p_flag & P_SINGLE_EXIT) {
+ td->td_dbgflags &= ~TDB_XSIG;
+ PROC_SUNLOCK(p);
+ return (sig);
+ }
+		/*
+		 * Just make wait() work; the last stopped thread
+		 * wins.
+		 */
+ p->p_xstat = sig;
+ p->p_xthread = td;
+ p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
+ sig_suspend_threads(td, p, 0);
+ if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
+ td->td_dbgflags &= ~TDB_STOPATFORK;
+ cv_broadcast(&p->p_dbgwait);
+ }
+stopme:
+ thread_suspend_switch(td);
+ if (p->p_xthread == td)
+ p->p_xthread = NULL;
+ if (!(p->p_flag & P_TRACED))
+ break;
+ if (td->td_dbgflags & TDB_SUSPEND) {
+ if (p->p_flag & P_SINGLE_EXIT)
+ break;
+ goto stopme;
+ }
+ }
+ PROC_SUNLOCK(p);
+ return (td->td_xsig);
+}
+
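+/*
+ * After a signal mask change, re-target any process-wide pending
+ * signals in 'block' to threads that can now take them, waking the
+ * chosen thread when the signal is traced or caught.
+ */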
+static void
+reschedule_signals(struct proc *p, sigset_t block, int flags)
+{
+ struct sigacts *ps;
+ struct thread *td;
+ int sig;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (SIGISEMPTY(p->p_siglist))
+ return;
+ ps = p->p_sigacts;
+ SIGSETAND(block, p->p_siglist);
+ while ((sig = sig_ffs(&block)) != 0) {
+ SIGDELSET(block, sig);
+ td = sigtd(p, sig, 0);
+ signotify(td);
+ if (!(flags & SIGPROCMASK_PS_LOCKED))
+ mtx_lock(&ps->ps_mtx);
+ if (p->p_flag & P_TRACED || SIGISMEMBER(ps->ps_sigcatch, sig))
+ tdsigwakeup(td, sig, SIG_CATCH,
+ (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
+ ERESTART));
+ if (!(flags & SIGPROCMASK_PS_LOCKED))
+ mtx_unlock(&ps->ps_mtx);
+ }
+}
+
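+/*
+ * Signal cleanup for an exiting thread: flush its private signal
+ * queue and, in a multithreaded process, block all signals in the
+ * thread and let the other threads pick up anything still pending.
+ */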
+void
+tdsigcleanup(struct thread *td)
+{
+ struct proc *p;
+ sigset_t unblocked;
+
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ sigqueue_flush(&td->td_sigqueue);
+ if (p->p_numthreads == 1)
+ return;
+
+ /*
+ * Since we cannot handle signals, notify signal post code
+ * about this by filling the sigmask.
+ *
+ * Also, if needed, wake up thread(s) that do not block the
+ * same signals as the exiting thread, since the thread might
+ * have been selected for delivery and woken up.
+ */
+ SIGFILLSET(unblocked);
+ SIGSETNAND(unblocked, td->td_sigmask);
+ SIGFILLSET(td->td_sigmask);
+ reschedule_signals(p, unblocked, 0);
+}
+
+/*
+ * Defer the delivery of SIGSTOP for the current thread. Returns true
+ * if stops were deferred and false if they were already deferred.
+ */
+int
+sigdeferstop(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ if (td->td_flags & TDF_SBDRY)
+ return (0);
+ thread_lock(td);
+ td->td_flags |= TDF_SBDRY;
+ thread_unlock(td);
+ return (1);
+}
+
+/*
+ * Permit the delivery of SIGSTOP for the current thread. This does
+ * not immediately suspend if a stop was posted. Instead, the thread
+ * will suspend either via ast() or a subsequent interruptible sleep.
+ */
+void
+sigallowstop()
+{
+ struct thread *td;
+
+ td = curthread;
+ thread_lock(td);
+ td->td_flags &= ~TDF_SBDRY;
+ thread_unlock(td);
+}
+
+/*
+ * If the current process has received a signal (should be caught or cause
+ * termination, should interrupt current syscall), return the signal number.
+ * Stop signals with default action are processed immediately, then cleared;
+ * they aren't returned. This is checked after each entry to the system for
+ * a syscall or trap (though this can usually be done without calling issignal
+ * by checking the pending signal masks in cursig.) The normal call
+ * sequence is
+ *
+ * while (sig = cursig(curthread))
+ * postsig(sig);
+ */
+static int
+issignal(struct thread *td)
+{
+ struct proc *p;
+ struct sigacts *ps;
+ struct sigqueue *queue;
+ sigset_t sigpending;
+ int sig, prop, newsig;
+
+ p = td->td_proc;
+ ps = p->p_sigacts;
+ mtx_assert(&ps->ps_mtx, MA_OWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ for (;;) {
+ int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
+
+ sigpending = td->td_sigqueue.sq_signals;
+ SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
+ SIGSETNAND(sigpending, td->td_sigmask);
+
+ if (p->p_flag & P_PPWAIT || td->td_flags & TDF_SBDRY)
+ SIG_STOPSIGMASK(sigpending);
+ if (SIGISEMPTY(sigpending)) /* no signal to send */
+ return (0);
+ sig = sig_ffs(&sigpending);
+
+ if (p->p_stops & S_SIG) {
+ mtx_unlock(&ps->ps_mtx);
+ stopevent(p, S_SIG, sig);
+ mtx_lock(&ps->ps_mtx);
+ }
+
+ /*
+ * We should see pending but ignored signals
+ * only if P_TRACED was on when they were posted.
+ */
+ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
+ sigqueue_delete(&td->td_sigqueue, sig);
+ sigqueue_delete(&p->p_sigqueue, sig);
+ continue;
+ }
+ if (p->p_flag & P_TRACED && (p->p_flag & P_PPTRACE) == 0) {
+ /*
+ * If traced, always stop.
+ * Remove old signal from queue before the stop.
+ * XXX shrug off debugger, it causes siginfo to
+ * be thrown away.
+ */
+ queue = &td->td_sigqueue;
+ td->td_dbgksi.ksi_signo = 0;
+ if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) {
+ queue = &p->p_sigqueue;
+ sigqueue_get(queue, sig, &td->td_dbgksi);
+ }
+
+ mtx_unlock(&ps->ps_mtx);
+ newsig = ptracestop(td, sig);
+ mtx_lock(&ps->ps_mtx);
+
+ if (sig != newsig) {
+
+ /*
+ * If parent wants us to take the signal,
+ * then it will leave it in p->p_xstat;
+ * otherwise we just look for signals again.
+ */
+ if (newsig == 0)
+ continue;
+ sig = newsig;
+
+ /*
+ * Put the new signal into td_sigqueue. If the
+ * signal is being masked, look for other
+ * signals.
+ */
+ sigqueue_add(queue, sig, NULL);
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ continue;
+ signotify(td);
+ } else {
+ if (td->td_dbgksi.ksi_signo != 0) {
+ td->td_dbgksi.ksi_flags |= KSI_HEAD;
+ if (sigqueue_add(&td->td_sigqueue, sig,
+ &td->td_dbgksi) != 0)
+ td->td_dbgksi.ksi_signo = 0;
+ }
+ if (td->td_dbgksi.ksi_signo == 0)
+ sigqueue_add(&td->td_sigqueue, sig,
+ NULL);
+ }
+
+ /*
+ * If the traced bit got turned off, go back up
+ * to the top to rescan signals. This ensures
+ * that p_sig* and p_sigact are consistent.
+ */
+ if ((p->p_flag & P_TRACED) == 0)
+ continue;
+ }
+
+ prop = sigprop(sig);
+
+ /*
+ * Decide whether the signal should be returned.
+ * Return the signal's number, or fall through
+ * to clear it from the pending mask.
+ */
+ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
+
+ case (intptr_t)SIG_DFL:
+ /*
+ * Don't take default actions on system processes.
+ */
+ if (p->p_pid <= 1) {
+#ifdef DIAGNOSTIC
+ /*
+ * Are you sure you want to ignore SIGSEGV
+ * in init? XXX
+ */
+ printf("Process (pid %lu) got signal %d\n",
+ (u_long)p->p_pid, sig);
+#endif
+ break; /* == ignore */
+ }
+ /*
+ * If there is a pending stop signal to process
+ * with default action, stop here,
+ * then clear the signal. However,
+ * if process is member of an orphaned
+ * process group, ignore tty stop signals.
+ */
+ if (prop & SA_STOP) {
+ if (p->p_flag & (P_TRACED|P_WEXIT) ||
+ (p->p_pgrp->pg_jobc == 0 &&
+ prop & SA_TTYSTOP))
+ break; /* == ignore */
+ mtx_unlock(&ps->ps_mtx);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
+ &p->p_mtx.lock_object, "Catching SIGSTOP");
+ p->p_flag |= P_STOPPED_SIG;
+ p->p_xstat = sig;
+ PROC_SLOCK(p);
+ sig_suspend_threads(td, p, 0);
+ thread_suspend_switch(td);
+ PROC_SUNLOCK(p);
+ mtx_lock(&ps->ps_mtx);
+ break;
+ } else if (prop & SA_IGNORE) {
+ /*
+ * Except for SIGCONT, shouldn't get here.
+ * Default action is to ignore; drop it.
+ */
+ break; /* == ignore */
+ } else
+ return (sig);
+ /*NOTREACHED*/
+
+ case (intptr_t)SIG_IGN:
+ /*
+ * Masking above should prevent us ever trying
+ * to take action on an ignored signal other
+ * than SIGCONT, unless process is traced.
+ */
+ if ((prop & SA_CONT) == 0 &&
+ (p->p_flag & P_TRACED) == 0)
+ printf("issignal\n");
+ break; /* == ignore */
+
+ default:
+ /*
+ * This signal has an action, let
+ * postsig() process it.
+ */
+ return (sig);
+ }
+ sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */
+ sigqueue_delete(&p->p_sigqueue, sig);
+ }
+ /* NOTREACHED */
+}
+
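+/*
+ * Notify the parent once the last thread of a process stopping on a
+ * signal has suspended; the process spin lock is dropped and
+ * reacquired around the notification.
+ */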
+void
+thread_stopped(struct proc *p)
+{
+ int n;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ n = p->p_suspcount;
+ if (p == curproc)
+ n++;
+ if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
+ PROC_SUNLOCK(p);
+ p->p_flag &= ~P_WAITED;
+ PROC_LOCK(p->p_pptr);
+ childproc_stopped(p, (p->p_flag & P_TRACED) ?
+ CLD_TRAPPED : CLD_STOPPED);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
+ }
+}
+
+/*
+ * Take the action for the specified signal
+ * from the current set of pending signals.
+ */
+int
+postsig(int sig)
+{
+ struct thread *td = curthread;
+	struct proc *p = td->td_proc;
+ struct sigacts *ps;
+ sig_t action;
+ ksiginfo_t ksi;
+ sigset_t returnmask, mask;
+
+ KASSERT(sig != 0, ("postsig"));
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ ps = p->p_sigacts;
+ mtx_assert(&ps->ps_mtx, MA_OWNED);
+ ksiginfo_init(&ksi);
+ if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
+ sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
+ return (0);
+ ksi.ksi_signo = sig;
+ if (ksi.ksi_code == SI_TIMER)
+ itimer_accept(p, ksi.ksi_timerid, &ksi);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_PSIG))
+ ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
+ &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
+#endif
+ if (p->p_stops & S_SIG) {
+ mtx_unlock(&ps->ps_mtx);
+ stopevent(p, S_SIG, sig);
+ mtx_lock(&ps->ps_mtx);
+ }
+
+ if (action == SIG_DFL) {
+ /*
+ * Default action, where the default is to kill
+ * the process. (Other cases were ignored above.)
+ */
+ mtx_unlock(&ps->ps_mtx);
+ sigexit(td, sig);
+ /* NOTREACHED */
+ } else {
+ /*
+ * If we get here, the signal must be caught.
+ */
+ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
+ ("postsig action"));
+ /*
+ * Set the new mask value and also defer further
+ * occurrences of this signal.
+ *
+ * Special case: user has done a sigsuspend. Here the
+ * current mask is not of interest, but rather the
+ * mask from before the sigsuspend is what we want
+ * restored after the signal processing is completed.
+ */
+ if (td->td_pflags & TDP_OLDMASK) {
+ returnmask = td->td_oldsigmask;
+ td->td_pflags &= ~TDP_OLDMASK;
+ } else
+ returnmask = td->td_sigmask;
+
+ mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(mask, sig);
+ kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
+ SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
+
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See kern_sigaction() for origin of this code.
+ */
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ td->td_ru.ru_nsignals++;
+ if (p->p_sig == sig) {
+ p->p_code = 0;
+ p->p_sig = 0;
+ }
+ (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+ }
+ return (1);
+}
+
+/*
+ * Kill the current process for stated reason.
+ */
+void
+killproc(struct proc *p, char *why)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid,
+ p->p_comm);
+ log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid,
+ p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why);
+ p->p_flag |= P_WKILLED;
+ kern_psignal(p, SIGKILL);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_acflag |= AXSIG;
+ /*
+ * We must be single-threading to generate a core dump. This
+ * ensures that the registers in the core file are up-to-date.
+ * Also, the ELF dump handler assumes that the thread list doesn't
+ * change out from under it.
+ *
+ * XXX If another thread attempts to single-thread before us
+ * (e.g. via fork()), we won't get a dump at all.
+ */
+ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
+ p->p_sig = sig;
+ /*
+ * Log signals which would cause core dumps
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+ * XXX : Todo, as well as euid, write out ruid too
+ * Note that coredump() drops proc lock.
+ */
+ if (coredump(td) == 0)
+ sig |= WCOREFLAG;
+ if (kern_logsigexit)
+ log(LOG_INFO,
+ "pid %d (%s), uid %d: exited on signal %d%s\n",
+ p->p_pid, p->p_comm,
+ td->td_ucred ? td->td_ucred->cr_uid : -1,
+ sig &~ WCOREFLAG,
+ sig & WCOREFLAG ? " (core dumped)" : "");
+ } else
+ PROC_UNLOCK(p);
+ exit1(td, W_EXITCODE(0, sig));
+ /* NOTREACHED */
+}
+
+/*
+ * Send queued SIGCHLD to parent when child process's state
+ * is changed.
+ */
+static void
+sigparent(struct proc *p, int reason, int status)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+ if (p->p_ksi != NULL) {
+ p->p_ksi->ksi_signo = SIGCHLD;
+ p->p_ksi->ksi_code = reason;
+ p->p_ksi->ksi_status = status;
+ p->p_ksi->ksi_pid = p->p_pid;
+ p->p_ksi->ksi_uid = p->p_ucred->cr_ruid;
+ if (KSI_ONQ(p->p_ksi))
+ return;
+ }
+ pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
+}
+
+static void
+childproc_jobstate(struct proc *p, int reason, int status)
+{
+ struct sigacts *ps;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+	/*
+	 * Wake up the parent sleeping in kern_wait() and also send
+	 * SIGCHLD to it.  Note that SIGCHLD alone does not guarantee
+	 * that the parent will wake up, because the parent may have
+	 * masked the signal.
+	 */
+ p->p_pptr->p_flag |= P_STATCHILD;
+ wakeup(p->p_pptr);
+
+ ps = p->p_pptr->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
+ mtx_unlock(&ps->ps_mtx);
+ sigparent(p, reason, status);
+ } else
+ mtx_unlock(&ps->ps_mtx);
+}
+
+void
+childproc_stopped(struct proc *p, int reason)
+{
+ childproc_jobstate(p, reason, p->p_xstat);
+}
+
+void
+childproc_continued(struct proc *p)
+{
+ childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
+}
+
+void
+childproc_exited(struct proc *p)
+{
+ int reason;
+ int status = p->p_xstat; /* convert to int */
+
+ reason = CLD_EXITED;
+ if (WCOREDUMP(status))
+ reason = CLD_DUMPED;
+ else if (WIFSIGNALED(status))
+ reason = CLD_KILLED;
+ /*
+ * XXX avoid calling wakeup(p->p_pptr), the work is
+ * done in exit1().
+ */
+ sigparent(p, reason, status);
+}
+
+/*
+ * We only have 1 character for the core count in the format
+ * string, so the range is 0-9.
+ */
+#define MAX_NUM_CORES 10
+static int num_cores = 5;
+
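+/* Sysctl handler for debug.ncores; clamps the value to 0..MAX_NUM_CORES. */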
+static int
+sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new_val;
+
+ new_val = num_cores;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val > MAX_NUM_CORES)
+ new_val = MAX_NUM_CORES;
+ if (new_val < 0)
+ new_val = 0;
+ num_cores = new_val;
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof(int), sysctl_debug_num_cores_check, "I", "");
+
+#if defined(COMPRESS_USER_CORES)
+int compress_user_cores = 1;
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW,
+ &compress_user_cores, 0, "Compression of user corefiles");
+
+int compress_user_cores_gzlevel = -1; /* default level */
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW,
+ &compress_user_cores_gzlevel, -1, "Corefile gzip compression level");
+
+#define GZ_SUFFIX ".gz"
+#define GZ_SUFFIX_LEN 3
+#endif
+
+static char corefilename[MAXPATHLEN] = {"%N.core"};
+TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
+SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
+ sizeof(corefilename), "Process corefile name format string");
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, vpp, namep)
+ * Expand the name described in corefilename, using name, uid, and pid,
+ * and open/create the core file.
+ * corefilename is a printf-like string, with these format specifiers:
+ *	%%	a literal %
+ *	%H	hostname
+ *	%I	autoincrementing index
+ *	%N	name of process ("name")
+ *	%P	process id (pid)
+ *	%U	user id (uid)
+ * For example, "%N.core" is the default; core dumps can be disabled
+ * completely by using "/dev/null", or all core files can be stored in
+ * "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
+static int
+corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
+ int compress, struct vnode **vpp, char **namep)
+{
+ struct nameidata nd;
+ struct sbuf sb;
+ const char *format;
+ char *hostname, *name;
+ int indexpos, i, error, cmode, flags, oflags;
+
+ hostname = NULL;
+ format = corefilename;
+ name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
+ indexpos = -1;
+ (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
+ for (i = 0; format[i] != '\0'; i++) {
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ sbuf_putc(&sb, '%');
+ break;
+ case 'H': /* hostname */
+ if (hostname == NULL) {
+ hostname = malloc(MAXHOSTNAMELEN,
+ M_TEMP, M_WAITOK);
+ }
+ getcredhostname(td->td_ucred, hostname,
+ MAXHOSTNAMELEN);
+ sbuf_printf(&sb, "%s", hostname);
+ break;
+ case 'I': /* autoincrementing index */
+ sbuf_printf(&sb, "0");
+ indexpos = sbuf_len(&sb) - 1;
+ break;
+ case 'N': /* process name */
+ sbuf_printf(&sb, "%s", comm);
+ break;
+ case 'P': /* process id */
+ sbuf_printf(&sb, "%u", pid);
+ break;
+ case 'U': /* user id */
+ sbuf_printf(&sb, "%u", uid);
+ break;
+ default:
+ log(LOG_ERR,
+ "Unknown format character %c in "
+ "corename `%s'\n", format[i], format);
+ break;
+ }
+ break;
+ default:
+ sbuf_putc(&sb, format[i]);
+ break;
+ }
+ }
+ free(hostname, M_TEMP);
+#ifdef COMPRESS_USER_CORES
+ if (compress)
+ sbuf_printf(&sb, GZ_SUFFIX);
+#endif
+ if (sbuf_error(&sb) != 0) {
+ log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
+ "long\n", (long)pid, comm, (u_long)uid);
+ sbuf_delete(&sb);
+ free(name, M_TEMP);
+ return (ENOMEM);
+ }
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+ /*
+ * If the core format has a %I in it, then we need to check
+ * for existing corefiles before returning a name.
+ * To do this we iterate over 0..num_cores to find a
+ * non-existing core file name to use.
+ */
+ if (indexpos != -1) {
+ for (i = 0; i < num_cores; i++) {
+ flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW;
+ name[indexpos] = '0' + i;
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
+ error = vn_open_cred(&nd, &flags, cmode, oflags,
+ td->td_ucred, NULL);
+ if (error) {
+ if (error == EEXIST)
+ continue;
+ log(LOG_ERR,
+ "pid %d (%s), uid (%u): Path `%s' failed "
+ "on initial open test, error = %d\n",
+ pid, comm, uid, name, error);
+ }
+ goto out;
+ }
+ }
+
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL);
+out:
+ if (error) {
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ *vpp = nd.ni_vp;
+ *namep = name;
+ return (0);
+}
+
+/*
+ * Dump a process' core. The main routine does some
+ * policy checking, and creates the name of the coredump;
+ * then it passes on a vnode and a size limit to the process-specific
+ * coredump routine if there is one; if there _is not_ one, it returns
+ * ENOSYS; otherwise it returns the error from the process-specific routine.
+ */
+
+static int
+coredump(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+ struct vnode *vp;
+ struct flock lf;
+ struct vattr vattr;
+ int error, error1, locked;
+ struct mount *mp;
+ char *name; /* name of corefile */
+ off_t limit;
+ int compress;
+
+#ifdef COMPRESS_USER_CORES
+ compress = compress_user_cores;
+#else
+ compress = 0;
+#endif
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+ _STOPEVENT(p, S_CORE, 0);
+
+ if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0)) {
+ PROC_UNLOCK(p);
+ return (EFAULT);
+ }
+
+ /*
+ * Note that the bulk of limit checking is done after
+ * the corefile is created. The exception is if the limit
+ * for corefiles is 0, in which case we don't bother
+ * creating the corefile at all. This layout means that
+ * a corefile is truncated instead of not being created,
+ * if it is larger than the limit.
+ */
+ limit = (off_t)lim_cur(p, RLIMIT_CORE);
+ if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+ PROC_UNLOCK(p);
+ return (EFBIG);
+ }
+ PROC_UNLOCK(p);
+
+restart:
+ error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress,
+ &vp, &name);
+ if (error != 0)
+ return (error);
+
+ /* Don't dump to non-regular files or files with links. */
+ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+ vattr.va_nlink != 1) {
+ VOP_UNLOCK(vp, 0);
+ error = EFAULT;
+ goto close;
+ }
+
+ VOP_UNLOCK(vp, 0);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ lf.l_type = F_UNLCK;
+ if (locked)
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
+ goto out;
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ goto out;
+ free(name, M_TEMP);
+ goto restart;
+ }
+
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ if (set_core_nodump_flag)
+ vattr.va_flags = UF_NODUMP;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ PROC_LOCK(p);
+ p->p_acflag |= ACORE;
+ PROC_UNLOCK(p);
+
+ if (p->p_sysent->sv_coredump != NULL) {
+ error = p->p_sysent->sv_coredump(td, vp, limit,
+ compress ? IMGACT_CORE_COMPRESS : 0);
+ } else {
+ error = ENOSYS;
+ }
+
+ if (locked) {
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ }
+close:
+ error1 = vn_close(vp, FWRITE, cred, td);
+ if (error == 0)
+ error = error1;
+out:
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+}
+
+/*
+ * Nonexistent system call; signal the process (it may want to handle it).
+ * Flag an error in case the process won't see the signal immediately
+ * (blocked or ignored).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nosys_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+nosys(struct thread *td, struct nosys_args *args)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ tdsignal(td, SIGSYS);
+ PROC_UNLOCK(p);
+ return (ENOSYS);
+}
+
+/*
+ * Send a SIGIO or SIGURG signal to a process or process group using stored
+ * credentials rather than those of the current process.
+ */
+void
+pgsigio(struct sigio **sigiop, int sig, int checkctty)
+{
+ ksiginfo_t ksi;
+ struct sigio *sigio;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ if (sigio->sio_pgid > 0) {
+ PROC_LOCK(sigio->sio_proc);
+ if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
+ kern_psignal(sigio->sio_proc, sig);
+ PROC_UNLOCK(sigio->sio_proc);
+ } else if (sigio->sio_pgid < 0) {
+ struct proc *p;
+
+ PGRP_LOCK(sigio->sio_pgrp);
+ LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
+ (checkctty == 0 || (p->p_flag & P_CONTROLT)))
+ kern_psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(sigio->sio_pgrp);
+ }
+ SIGIO_UNLOCK();
+}
+
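+/*
+ * Attach a signal knote to the current process's knote list; EV_CLEAR
+ * is forced so the event state is reset after each retrieval.
+ */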
+static int
+filt_sigattach(struct knote *kn)
+{
+ struct proc *p = curproc;
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ knlist_add(&p->p_klist, kn, 0);
+
+ return (0);
+}
+
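+/* Detach a signal knote from its process's knote list. */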
+static void
+filt_sigdetach(struct knote *kn)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+
+ knlist_remove(&p->p_klist, kn, 0);
+}
+
+/*
+ * Signal knotes are shared with proc knotes, so we apply a mask to
+ * the hint in order to differentiate them from process hints. This
+ * could be avoided by using a signal-specific knote list, but probably
+ * isn't worth the trouble.
+ */
+static int
+filt_signal(struct knote *kn, long hint)
+{
+
+ if (hint & NOTE_SIGNAL) {
+ hint &= ~NOTE_SIGNAL;
+
+ if (kn->kn_id == hint)
+ kn->kn_data++;
+ }
+ return (kn->kn_data != 0);
+}
+
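+/*
+ * Reference-counted allocation of the shared signal action state.
+ * sigacts_alloc() returns a zeroed structure holding one reference;
+ * sigacts_hold() and sigacts_free() adjust the count, destroying the
+ * structure when the last reference goes away.
+ */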
+struct sigacts *
+sigacts_alloc(void)
+{
+ struct sigacts *ps;
+
+ ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO);
+ ps->ps_refcnt = 1;
+ mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF);
+ return (ps);
+}
+
+void
+sigacts_free(struct sigacts *ps)
+{
+
+ mtx_lock(&ps->ps_mtx);
+ ps->ps_refcnt--;
+ if (ps->ps_refcnt == 0) {
+ mtx_destroy(&ps->ps_mtx);
+ free(ps, M_SUBPROC);
+ } else
+ mtx_unlock(&ps->ps_mtx);
+}
+
+struct sigacts *
+sigacts_hold(struct sigacts *ps)
+{
+ mtx_lock(&ps->ps_mtx);
+ ps->ps_refcnt++;
+ mtx_unlock(&ps->ps_mtx);
+ return (ps);
+}
+
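+/*
+ * Copy the signal action state from src to dest, excluding everything
+ * from the reference count onward.
+ */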
+void
+sigacts_copy(struct sigacts *dest, struct sigacts *src)
+{
+
+ KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
+ mtx_lock(&src->ps_mtx);
+ bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
+ mtx_unlock(&src->ps_mtx);
+}
+
+int
+sigacts_shared(struct sigacts *ps)
+{
+ int shared;
+
+ mtx_lock(&ps->ps_mtx);
+ shared = ps->ps_refcnt > 1;
+ mtx_unlock(&ps->ps_mtx);
+ return (shared);
+}
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
new file mode 100644
index 0000000..d0009b1
--- /dev/null
+++ b/sys/kern/kern_switch.c
@@ -0,0 +1,513 @@
+/*-
+ * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+/* Uncomment this to enable logging of critical_enter/exit. */
+#if 0
+#define KTR_CRITICAL KTR_SCHED
+#else
+#define KTR_CRITICAL 0
+#endif
+
+#ifdef FULL_PREEMPTION
+#ifndef PREEMPTION
+#error "The FULL_PREEMPTION option requires the PREEMPTION option"
+#endif
+#endif
+
+CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
+
+/*
+ * kern.sched.preemption allows user space to determine if preemption support
+ * is compiled in or not. It is not currently a boot or runtime flag that
+ * can be changed.
+ */
+#ifdef PREEMPTION
+static int kern_sched_preemption = 1;
+#else
+static int kern_sched_preemption = 0;
+#endif
+SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
+ &kern_sched_preemption, 0, "Kernel preemption enabled");
+
+/*
+ * Support for scheduler stats exported via kern.sched.stats. All stats may
+ * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere
+ * with SCHED_STAT_DEFINE().
+ */
+#ifdef SCHED_STATS
+SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+
+/* Switch reasons from mi_switch(). */
+DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
+SCHED_STAT_DEFINE_VAR(uncategorized,
+ &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
+SCHED_STAT_DEFINE_VAR(preempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(owepreempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(turnstile,
+ &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
+SCHED_STAT_DEFINE_VAR(sleepq,
+ &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
+SCHED_STAT_DEFINE_VAR(sleepqtimo,
+ &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
+SCHED_STAT_DEFINE_VAR(relinquish,
+ &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
+SCHED_STAT_DEFINE_VAR(needresched,
+ &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
+SCHED_STAT_DEFINE_VAR(idle,
+ &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
+SCHED_STAT_DEFINE_VAR(iwait,
+ &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
+SCHED_STAT_DEFINE_VAR(suspend,
+ &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
+SCHED_STAT_DEFINE_VAR(remotepreempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(remotewakeidle,
+ &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
+
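+/*
+ * Sysctl handler for kern.sched.stats.reset; writing a non-zero value
+ * zeroes every per-CPU switch counter registered under the stats node.
+ */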
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *p;
+ uintptr_t counter;
+ int error;
+ int val;
+ int i;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val == 0)
+ return (0);
+ /*
+ * Traverse the list of children of _kern_sched_stats and reset each
+ * to 0. Skip the reset entry.
+ */
+ SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
+ if (p == oidp || p->oid_arg1 == NULL)
+ continue;
+ counter = (uintptr_t)p->oid_arg1;
+ CPU_FOREACH(i) {
+ *(long *)(dpcpu_off[i] + counter) = 0;
+ }
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+ 0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
+/************************************************************************
+ * Functions that manipulate runnability from a thread perspective. *
+ ************************************************************************/
+/*
+ * Select the thread that will be run next.
+ */
+struct thread *
+choosethread(void)
+{
+ struct thread *td;
+
+retry:
+ td = sched_choose();
+
+ /*
+ * If we are in panic, only allow system threads,
+ * plus the one we are running in, to be run.
+ */
+ if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
+ (td->td_flags & TDF_INPANIC) == 0)) {
+ /* note that it is no longer on the run queue */
+ TD_SET_CAN_RUN(td);
+ goto retry;
+ }
+
+ TD_SET_RUNNING(td);
+ return (td);
+}
+
+/*
+ * Kernel thread preemption implementation. Critical sections mark
+ * regions of code in which preemptions are not allowed.
+ *
+ * It might seem a good idea to inline critical_enter() but, in order
+ * to prevent instructions reordering by the compiler, a __compiler_membar()
+ * would have to be used here (the same as sched_pin()). The performance
+ * penalty imposed by the membar could, then, produce slower code than
+ * the function call itself, for most cases.
+ */
+void
+critical_enter(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ td->td_critnest++;
+ CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
+ (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
+}
+
+void
+critical_exit(void)
+{
+ struct thread *td;
+ int flags;
+
+ td = curthread;
+ KASSERT(td->td_critnest != 0,
+ ("critical_exit: td_critnest == 0"));
+
+ if (td->td_critnest == 1) {
+ td->td_critnest = 0;
+ if (td->td_owepreempt && !kdb_active) {
+ td->td_critnest = 1;
+ thread_lock(td);
+ td->td_critnest--;
+ flags = SW_INVOL | SW_PREEMPT;
+ if (TD_IS_IDLETHREAD(td))
+ flags |= SWT_IDLE;
+ else
+ flags |= SWT_OWEPREEMPT;
+ mi_switch(flags, NULL);
+ thread_unlock(td);
+ }
+ } else
+ td->td_critnest--;
+
+ CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
+ (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
+}
+
+/************************************************************************
+ * SYSTEM RUN QUEUE manipulations and tests *
+ ************************************************************************/
+/*
+ * Initialize a run structure.
+ */
+void
+runq_init(struct runq *rq)
+{
+ int i;
+
+ bzero(rq, sizeof *rq);
+ for (i = 0; i < RQ_NQS; i++)
+ TAILQ_INIT(&rq->rq_queues[i]);
+}
+
+/*
+ * Clear the status bit of the queue corresponding to priority level pri,
+ * indicating that it is empty.
+ */
+static __inline void
+runq_clrbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
+}
+
+/*
+ * Find the index of the first non-empty run queue. This is done by
+ * scanning the status bits, a set bit indicates a non-empty queue.
+ */
+static __inline int
+runq_findbit(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
+ rqb->rqb_bits[i], i, pri);
+ return (pri);
+ }
+
+ return (-1);
+}
+
+static __inline int
+runq_findbit_from(struct runq *rq, u_char pri)
+{
+ struct rqbits *rqb;
+ rqb_word_t mask;
+ int i;
+
+ /*
+ * Set the mask for the first word so we ignore priorities before 'pri'.
+ */
+ mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
+ rqb = &rq->rq_status;
+again:
+ for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
+ mask = rqb->rqb_bits[i] & mask;
+ if (mask == 0)
+ continue;
+ pri = RQB_FFS(mask) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
+ mask, i, pri);
+ return (pri);
+ }
+ if (pri == 0)
+ return (-1);
+ /*
+ * Wrap back around to the beginning of the list just once so we
+ * scan the whole thing.
+ */
+ pri = 0;
+ goto again;
+}
+
+/*
+ * Set the status bit of the queue corresponding to priority level pri,
+ * indicating that it is non-empty.
+ */
+static __inline void
+runq_setbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
+}
+
+/*
+ * Add the thread to the queue specified by its priority, and set the
+ * corresponding status bit.
+ */
+void
+runq_add(struct runq *rq, struct thread *td, int flags)
+{
+ struct rqhead *rqh;
+ int pri;
+
+ pri = td->td_priority / RQ_PPQ;
+ td->td_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ if (flags & SRQ_PREEMPTED) {
+ TAILQ_INSERT_HEAD(rqh, td, td_runq);
+ } else {
+ TAILQ_INSERT_TAIL(rqh, td, td_runq);
+ }
+}
+
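+/*
+ * As runq_add(), but place the thread on the queue at the caller
+ * supplied index instead of one derived from its priority.
+ */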
+void
+runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
+{
+ struct rqhead *rqh;
+
+ KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
+ td->td_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ if (flags & SRQ_PREEMPTED) {
+ TAILQ_INSERT_HEAD(rqh, td, td_runq);
+ } else {
+ TAILQ_INSERT_TAIL(rqh, td, td_runq);
+ }
+}
+/*
+ * Return true if there are runnable processes of any priority on the run
+ * queue, false otherwise. Has no side effects, does not modify the run
+ * queue structure.
+ */
+int
+runq_check(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
+ rqb->rqb_bits[i], i);
+ return (1);
+ }
+ CTR0(KTR_RUNQ, "runq_check: empty");
+
+ return (0);
+}
+
+/*
+ * Find the highest priority process on the run queue.
+ */
+struct thread *
+runq_choose_fuzz(struct runq *rq, int fuzz)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ while ((pri = runq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+		/* fuzz == 1 is normal; 0 or less are ignored */
+ if (fuzz > 1) {
+ /*
+ * In the first couple of entries, check if
+ * there is one for our CPU as a preference.
+ */
+ int count = fuzz;
+ int cpu = PCPU_GET(cpuid);
+ struct thread *td2;
+ td2 = td = TAILQ_FIRST(rqh);
+
+ while (count-- && td2) {
+ if (td2->td_lastcpu == cpu) {
+ td = td2;
+ break;
+ }
+ td2 = TAILQ_NEXT(td2, td_runq);
+ }
+ } else
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
+ CTR3(KTR_RUNQ,
+ "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);
+
+ return (NULL);
+}
+
+/*
+ * Find the highest priority process on the run queue.
+ */
+struct thread *
+runq_choose(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ while ((pri = runq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
+ CTR3(KTR_RUNQ,
+ "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);
+
+ return (NULL);
+}
+
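+/*
+ * Find the highest priority thread at or after the given queue index,
+ * wrapping around once if needed.
+ */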
+struct thread *
+runq_choose_from(struct runq *rq, u_char idx)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ if ((pri = runq_findbit_from(rq, idx)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
+ CTR4(KTR_RUNQ,
+ "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
+ pri, td, td->td_rqindex, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);
+
+ return (NULL);
+}
+/*
+ * Remove the thread from the queue specified by its priority, and clear the
+ * corresponding status bit if the queue becomes empty.
+ * Caller must set state afterwards.
+ */
+void
+runq_remove(struct runq *rq, struct thread *td)
+{
+
+ runq_remove_idx(rq, td, NULL);
+}
+
+void
+runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
+{
+ struct rqhead *rqh;
+ u_char pri;
+
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("runq_remove_idx: thread swapped out"));
+ pri = td->td_rqindex;
+ KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ TAILQ_REMOVE(rqh, td, td_runq);
+ if (TAILQ_EMPTY(rqh)) {
+ CTR0(KTR_RUNQ, "runq_remove_idx: empty");
+ runq_clrbit(rq, pri);
+ if (idx != NULL && *idx == pri)
+ *idx = (pri + 1) % RQ_NQS;
+ }
+}
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
new file mode 100644
index 0000000..ff5d95d
--- /dev/null
+++ b/sys/kern/kern_sx.c
@@ -0,0 +1,1214 @@
+/*-
+ * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
+ * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Shared/exclusive locks. This implementation attempts to ensure
+ * deterministic lock granting behavior, so that slocks and xlocks are
+ * interleaved.
+ *
+ * Priority propagation will not generally raise the priority of lock holders,
+ * so should not be relied upon in combination with sx locks.
+ */
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_no_adaptive_sx.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sleepqueue.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_SX)
+#include <machine/cpu.h>
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_SX)
+#define ADAPTIVE_SX
+#endif
+
+CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE);
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+/* Handy macros for sleep queues. */
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+/*
+ * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We
+ * drop Giant anytime we have to sleep or if we adaptively spin.
+ */
+#define GIANT_DECLARE \
+ int _giantcnt = 0; \
+ WITNESS_SAVE_DECL(Giant) \
+
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _giantcnt++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
+#define GIANT_RESTORE() do { \
+ if (_giantcnt > 0) { \
+ mtx_assert(&Giant, MA_NOTOWNED); \
+ while (_giantcnt--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+
+/*
+ * Returns true if an exclusive lock is recursed. It assumes
+ * curthread currently has an exclusive lock.
+ */
+#define sx_recurse lock_object.lo_data
+#define sx_recursed(sx) ((sx)->sx_recurse != 0)
+
+static void assert_sx(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_sx(const struct lock_object *lock);
+#endif
+static void lock_sx(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_sx(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_sx(struct lock_object *lock);
+
+struct lock_class lock_class_sx = {
+ .lc_name = "sx",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+ .lc_assert = assert_sx,
+#ifdef DDB
+ .lc_ddb_show = db_show_sx,
+#endif
+ .lc_lock = lock_sx,
+ .lc_unlock = unlock_sx,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_sx,
+#endif
+};
+
+#ifndef INVARIANTS
+#define _sx_assert(sx, what, file, line)
+#endif
+
+#ifdef ADAPTIVE_SX
+static u_int asx_retries = 10;
+static u_int asx_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
+SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
+SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
+#endif
+
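+/*
+ * Implementations of the lock_class_sx methods declared above
+ * (lc_assert, lc_lock, lc_unlock and, with KDTRACE_HOOKS, lc_owner).
+ */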
+void
+assert_sx(const struct lock_object *lock, int what)
+{
+
+ sx_assert((const struct sx *)lock, what);
+}
+
+void
+lock_sx(struct lock_object *lock, int how)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ if (how)
+ sx_xlock(sx);
+ else
+ sx_slock(sx);
+}
+
+int
+unlock_sx(struct lock_object *lock)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
+ if (sx_xlocked(sx)) {
+ sx_xunlock(sx);
+ return (1);
+ } else {
+ sx_sunlock(sx);
+ return (0);
+ }
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_sx(const struct lock_object *lock, struct thread **owner)
+{
+ const struct sx *sx = (const struct sx *)lock;
+ uintptr_t x = sx->sx_lock;
+
+ *owner = (struct thread *)SX_OWNER(x);
+ return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) :
+ (*owner != NULL));
+}
+#endif
+
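+/*
+ * Initialize an sx lock from a struct sx_args; intended for use as a
+ * SYSINIT callback.
+ */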
+void
+sx_sysinit(void *arg)
+{
+ struct sx_args *sargs = arg;
+
+ sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
+}
+
+void
+sx_init_flags(struct sx *sx, const char *description, int opts)
+{
+ int flags;
+
+ MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
+ SX_NOPROFILE | SX_NOADAPTIVE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock,
+ ("%s: sx_lock not aligned for %s: %p", __func__, description,
+ &sx->sx_lock));
+
+ flags = LO_SLEEPABLE | LO_UPGRADABLE;
+ if (opts & SX_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & SX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & SX_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & SX_RECURSE)
+ flags |= LO_RECURSABLE;
+ if (opts & SX_QUIET)
+ flags |= LO_QUIET;
+
+ flags |= opts & SX_NOADAPTIVE;
+ lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags);
+ sx->sx_lock = SX_LOCK_UNLOCKED;
+ sx->sx_recurse = 0;
+}
+
+void
+sx_destroy(struct sx *sx)
+{
+
+ KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
+ KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
+ sx->sx_lock = SX_LOCK_DESTROYED;
+ lock_destroy(&sx->lock_object);
+}
+
+int
+_sx_slock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_slock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_slock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL);
+ error = __sx_slock(sx, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&sx->lock_object, 0, file, line);
+ curthread->td_locks++;
+ }
+
+ return (error);
+}
+
+int
+sx_try_slock_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_try_slock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+
+ for (;;) {
+ x = sx->sx_lock;
+ KASSERT(x != SX_LOCK_DESTROYED,
+ ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
+ if (!(x & SX_LOCK_SHARED))
+ break;
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
+ curthread->td_locks++;
+ return (1);
+ }
+ }
+
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
+ return (0);
+}
+
+int
+_sx_xlock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_xlock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xlock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line, NULL);
+ error = __sx_xlock(sx, curthread, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
+ file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+ }
+
+ return (error);
+}
+
+int
+sx_try_xlock_(struct sx *sx, const char *file, int line)
+{
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
+
+ if (sx_xlocked(sx) &&
+ (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ rval = 1;
+ } else
+ rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED,
+ (uintptr_t)curthread);
+ LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ }
+
+ return (rval);
+}
+
+void
+_sx_sunlock(struct sx *sx, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
+ WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
+ __sx_sunlock(sx, file, line);
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx);
+ curthread->td_locks--;
+}
+
+void
+_sx_xunlock(struct sx *sx, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED, file, line);
+ WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
+ line);
+ if (!sx_recursed(sx))
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx);
+ __sx_xunlock(sx, curthread, file, line);
+ curthread->td_locks--;
+}
+
+/*
+ * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
+ * This will only succeed if this thread holds a single shared lock.
+ * Return 1 if the upgrade succeeds, 0 otherwise.
+ */
+int
+sx_try_upgrade_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int success;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
+
+ /*
+ * Try to switch from one shared lock to an exclusive lock. We need
+ * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
+ * we will wake up the exclusive waiters when we drop the lock.
+ */
+ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
+ success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x,
+ (uintptr_t)curthread | x);
+ LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
+ if (success) {
+ WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ LOCKSTAT_RECORD0(LS_SX_TRYUPGRADE_UPGRADE, sx);
+ }
+ return (success);
+}
+
+/*
+ * Downgrade an unrecursed exclusive lock into a single shared lock.
+ */
+void
+sx_downgrade_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (sx_recursed(sx))
+ panic("downgrade of a recursed lock");
+#endif
+
+ WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
+
+ /*
+ * Try to switch from an exclusive lock with no shared waiters
+ * to one sharer with no shared waiters. If there are
+ * exclusive waiters, we don't need to lock the sleep queue so
+ * long as we preserve the flag. We do one quick try and if
+ * that fails we grab the sleepq lock to keep the flags from
+ * changing and do it the slow way.
+ *
+ * We have to lock the sleep queue if there are shared waiters
+ * so we can wake them up.
+ */
+ x = sx->sx_lock;
+ if (!(x & SX_LOCK_SHARED_WAITERS) &&
+ atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS))) {
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+ return;
+ }
+
+ /*
+ * Lock the sleep queue so we can read the waiters bits
+ * without any races and wakeup any shared waiters.
+ */
+ sleepq_lock(&sx->lock_object);
+
+ /*
+ * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
+ * shared lock. If there are any shared waiters, wake them up.
+ */
+ wakeup_swapper = 0;
+ x = sx->sx_lock;
+ atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS));
+ if (x & SX_LOCK_SHARED_WAITERS)
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
+ 0, SQ_SHARED_QUEUE);
+ sleepq_release(&sx->lock_object);
+
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+ LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx);
+
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * This function represents the so-called 'hard case' for sx_xlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+int
+_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
+ int line)
+{
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+ u_int i, spintries = 0;
+#endif
+ uintptr_t x;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ int error = 0;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+
+ /* If we already hold an exclusive lock, then recurse. */
+ if (sx_xlocked(sx)) {
+ KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0,
+ ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
+ sx->lock_object.lo_name, file, line));
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
+ return (0);
+ }
+
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
+
+ while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+#ifdef ADAPTIVE_SX
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ x = sx->sx_lock;
+ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ if ((x & SX_LOCK_SHARED) == 0) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ } else if (SX_SHARERS(x) && spintries < asx_retries) {
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < asx_loops; i++) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, sx, spintries, i);
+ x = sx->sx_lock;
+ if ((x & SX_LOCK_SHARED) == 0 ||
+ SX_SHARERS(x) == 0)
+ break;
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ if (i != asx_loops)
+ continue;
+ }
+ }
+#endif
+
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
+
+ /*
+ * If the lock was released while spinning on the
+ * sleep queue chain lock, try again.
+ */
+ if (x == SX_LOCK_UNLOCKED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_SX
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the sleep queue
+ * chain lock. If so, drop the sleep queue lock and try
+ * again.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * If an exclusive lock was released with both shared
+ * and exclusive waiters and a shared waiter hasn't
+ * woken up and acquired the lock yet, sx_lock will be
+ * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
+ * If we see that value, try to acquire it once. Note
+ * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
+ * as there are other exclusive waiters still. If we
+ * fail, restart the loop.
+ */
+ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock,
+ SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
+ tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+ __func__, sx);
+ break;
+ }
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+ /*
+ * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag. If we fail,
+ * then loop back and retry.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the exclusive
+ * lock and the exclusive waiters flag is set, we have
+ * to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ GIANT_SAVE();
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object, 0);
+ else
+ error = sleepq_wait_sig(&sx->lock_object, 0);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
+ }
+
+ GIANT_RESTORE();
+ if (!error)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, sx,
+ contested, waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time);
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
+#endif
+ return (error);
+}
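/*
 * Illustrative sketch, not part of this change: how a consumer reaches the
 * exclusive-lock paths above through the public sx(9) wrappers.  The
 * "foo" lock, counter and lock name are hypothetical.
 */
static struct sx foo_lock;
static int foo_count;

static void
foo_init(void)
{

	sx_init(&foo_lock, "foo lock");	/* not recursable; SX_RECURSE not set */
}

static void
foo_bump(void)
{

	sx_xlock(&foo_lock);		/* contended case falls into _sx_xlock_hard() */
	foo_count++;
	sx_xunlock(&foo_lock);		/* waiters present: _sx_xunlock_hard() */
}

static int
foo_bump_sig(void)
{
	int error;

	/* Interruptible variant; maps to opts = SX_INTERRUPTIBLE above. */
	error = sx_xlock_sig(&foo_lock);
	if (error != 0)
		return (error);		/* EINTR/ERESTART from sleepq_wait_sig() */
	foo_count++;
	sx_xunlock(&foo_lock);
	return (0);
}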
+
+/*
+ * This function represents the so-called 'hard case' for sx_xunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+void
+_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
+{
+ uintptr_t x;
+ int queue, wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
+
+ /* If the lock is recursed, then unrecurse one level. */
+ if (sx_xlocked(sx) && sx_recursed(sx)) {
+ if ((--sx->sx_recurse) == 0)
+ atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
+ return;
+ }
+ MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
+ SX_LOCK_EXCLUSIVE_WAITERS));
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
+
+ sleepq_lock(&sx->lock_object);
+ x = SX_LOCK_UNLOCKED;
+
+ /*
+ * The wake up algorithm here is quite simple and probably not
+ * ideal. It gives precedence to shared waiters if they are
+ * present. For this condition, we have to preserve the
+ * state of the exclusive waiters flag.
+ * If interruptible sleeps left the shared queue empty, avoid
+ * starving the threads sleeping on the exclusive queue by giving
+ * them precedence and clearing the shared waiters bit anyway.
+ */
+ if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 &&
+ sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) {
+ queue = SQ_SHARED_QUEUE;
+ x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
+ } else
+ queue = SQ_EXCLUSIVE_QUEUE;
+
+ /* Wake up all the waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
+ __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&sx->sx_lock, x);
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0,
+ queue);
+ sleepq_release(&sx->lock_object);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * This function represents the so-called 'hard case' for sx_slock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+int
+_sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
+{
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t x;
+ int error = 0;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+
+ /*
+ * As with rwlocks, we don't make any attempt to try to block
+ * shared locks once there is an exclusive waiter.
+ */
+ for (;;) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ x = sx->sx_lock;
+
+ /*
+ * If no other thread has an exclusive lock then try to bump up
+ * the count of sharers. Since we have to preserve the state
+ * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
+ * shared lock, loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ MPASS(!(x & SX_LOCK_SHARED_WAITERS));
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
+ x + SX_ONE_SHARER)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ sx, (void *)x,
+ (void *)(x + SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ cpu_spinwait();
+ }
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Some other thread already has an exclusive lock, so
+ * start the process of blocking.
+ */
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
+
+ /*
+ * The lock could have been released while we spun.
+ * In this case loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Try to set the SX_LOCK_SHARED_WAITERS flag. If we
+ * fail to set it drop the sleep queue lock and loop
+ * back.
+ */
+ if (!(x & SX_LOCK_SHARED_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_SHARED_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the shared lock,
+ * we have to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ GIANT_SAVE();
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object, 0);
+ else
+ error = sleepq_wait_sig(&sx->lock_object, 0);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
+ }
+ if (error == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx,
+ contested, waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_SX_SLOCK_BLOCK, sx, sleep_time);
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_SX_SLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
+#endif
+ GIANT_RESTORE();
+ return (error);
+}
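/*
 * Illustrative sketch, not part of this change: the reader side of the same
 * hypothetical "foo" data, exercising the shared paths above.
 */
static int
foo_read(void)
{
	int v;

	sx_slock(&foo_lock);		/* contended case falls into _sx_slock_hard() */
	v = foo_count;
	sx_sunlock(&foo_lock);		/* last sharer with waiters: _sx_sunlock_hard() */
	return (v);
}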
+
+/*
+ * This function represents the so-called 'hard case' for sx_sunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+void
+_sx_sunlock_hard(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ for (;;) {
+ x = sx->sx_lock;
+
+ /*
+ * We should never have waiting sharers while at least one
+ * thread holds a shared lock, since new readers can always
+ * join an already share-locked lock.
+ */
+ KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
+ ("%s: waiting sharers", __func__));
+
+ /*
+ * See if there is more than one shared lock held. If
+ * so, just drop one and return.
+ */
+ if (SX_SHARERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&sx->sx_lock, x,
+ x - SX_ONE_SHARER)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, sx, (void *)x,
+ (void *)(x - SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If there aren't any waiters for an exclusive lock,
+ * then try to drop it quickly.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ MPASS(x == SX_SHARERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&sx->sx_lock,
+ SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, sx);
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * At this point, there should just be one sharer with
+ * exclusive waiters.
+ */
+ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
+
+ sleepq_lock(&sx->lock_object);
+
+ /*
+ * The wakeup semantic here is quite simple:
+ * just wake up all the exclusive waiters.
+ * Note that the state of the lock could have changed,
+ * so if the cmpset fails, loop back and retry.
+ */
+ if (!atomic_cmpset_rel_ptr(&sx->sx_lock,
+ SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
+ SX_LOCK_UNLOCKED)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p waking up all thread on"
+ "exclusive queue", __func__, sx);
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
+ 0, SQ_EXCLUSIVE_QUEUE);
+ sleepq_release(&sx->lock_object);
+ if (wakeup_swapper)
+ kick_proc0();
+ break;
+ }
+}
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _sx_assert
+#endif
+
+/*
+ * In the non-WITNESS case, sx_assert() can only detect that at least
+ * *some* thread owns an slock, but it cannot guarantee that *this*
+ * thread owns an slock.
+ */
+void
+_sx_assert(const struct sx *sx, int what, const char *file, int line)
+{
+#ifndef WITNESS
+ int slocked = 0;
+#endif
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case SA_SLOCKED:
+ case SA_SLOCKED | SA_NOTRECURSED:
+ case SA_SLOCKED | SA_RECURSED:
+#ifndef WITNESS
+ slocked = 1;
+ /* FALLTHROUGH */
+#endif
+ case SA_LOCKED:
+ case SA_LOCKED | SA_NOTRECURSED:
+ case SA_LOCKED | SA_RECURSED:
+#ifdef WITNESS
+ witness_assert(&sx->lock_object, what, file, line);
+#else
+ /*
+ * If some other thread has an exclusive lock or we
+ * have one and are asserting a shared lock, fail.
+ * Also, if no one has a lock at all, fail.
+ */
+ if (sx->sx_lock == SX_LOCK_UNLOCKED ||
+ (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
+ sx_xholder(sx) != curthread)))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ sx->lock_object.lo_name, slocked ? "share " : "",
+ file, line);
+
+ if (!(sx->sx_lock & SX_LOCK_SHARED)) {
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file,
+ line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ }
+#endif
+ break;
+ case SA_XLOCKED:
+ case SA_XLOCKED | SA_NOTRECURSED:
+ case SA_XLOCKED | SA_RECURSED:
+ if (sx_xholder(sx) != curthread)
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ break;
+ case SA_UNLOCKED:
+#ifdef WITNESS
+ witness_assert(&sx->lock_object, what, file, line);
+#else
+ /*
+ * If we hold an exclusive lock, fail. We can't reliably
+ * check whether we hold a shared lock or not.
+ */
+ if (sx_xholder(sx) == curthread)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+#endif
+ break;
+ default:
+ panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+static void
+db_show_sx(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct sx *sx;
+
+ sx = (const struct sx *)lock;
+
+ db_printf(" state: ");
+ if (sx->sx_lock == SX_LOCK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (sx->sx_lock == SX_LOCK_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
+ else {
+ td = sx_xholder(sx);
+ db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (sx_recursed(sx))
+ db_printf(" recursed: %d\n", sx->sx_recurse);
+ }
+
+ db_printf(" waiters: ");
+ switch(sx->sx_lock &
+ (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ case SX_LOCK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive and shared\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+}
+
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on an sx lock. If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+sx_chain(struct thread *td, struct thread **ownerp)
+{
+ struct sx *sx;
+
+ /*
+ * Check to see if this thread is blocked on an sx lock.
+ * First, we check the lock class. If that is ok, then we
+ * compare the lock name against the wait message.
+ */
+ sx = td->td_wchan;
+ if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+ sx->lock_object.lo_name != td->td_wmesg)
+ return (0);
+
+ /* We think we have an sx lock, so output some details. */
+ db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+ *ownerp = sx_xholder(sx);
+ if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK (count %ju)\n",
+ (uintmax_t)SX_SHARERS(sx->sx_lock));
+ else
+ db_printf("XLOCK\n");
+ return (1);
+}
+#endif
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
new file mode 100644
index 0000000..b0e1908
--- /dev/null
+++ b/sys/kern/kern_synch.c
@@ -0,0 +1,632 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/cpu.h>
+
+#ifdef XEN
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#endif
+
+#define KTDSTATE(td) \
+ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
+ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
+ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \
+ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \
+ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
+
+static void synch_setup(void *dummy);
+SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
+ NULL);
+
+int hogticks;
+static uint8_t pause_wchan[MAXCPU];
+
+static struct callout loadav_callout;
+
+struct loadavg averunnable =
+ { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
+/*
+ * Constants for averages over 1, 5, and 15 minutes
+ * when sampling at 5 second intervals.
+ */
+static fixpt_t cexp[3] = {
+ 0.9200444146293232 * FSCALE, /* exp(-1/12) */
+ 0.9834714538216174 * FSCALE, /* exp(-1/60) */
+ 0.9944598480048967 * FSCALE, /* exp(-1/180) */
+};
+
+/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
+static int fscale __unused = FSCALE;
+SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
+
+static void loadav(void *arg);
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , preempt, preempt);
+
+/*
+ * These probes reference Solaris features that are not implemented in FreeBSD.
+ * Create the probes anyway for compatibility with existing D scripts; they'll
+ * just never fire.
+ */
+SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep);
+SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup);
+SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield);
+
+static void
+sleepinit(void *unused)
+{
+
+ hogticks = (hz / 10) * 2; /* Default only. */
+ init_sleepqueues();
+}
+
+/*
+ * vmem tries to lock the sleepq mutexes when freeing kva, so make sure
+ * the sleep queues are available by then.
+ */
+SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0);
+
+/*
+ * General sleep call. Suspends the current thread until a wakeup is
+ * performed on the specified identifier. The thread will then be made
+ * runnable with the specified priority. Sleeps at most sbt units of time
+ * (0 means no timeout). If pri includes the PCATCH flag, let signals
+ * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if
+ * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
+ * signal becomes pending, ERESTART is returned if the current system
+ * call should be restarted if possible, and EINTR is returned if the
+ * system call should be interrupted by the signal.
+ *
+ * The lock argument is unlocked before the caller is suspended, and
+ * re-locked before _sleep() returns. If priority includes the PDROP
+ * flag the lock is not re-locked before returning.
+ */
+int
+_sleep(void *ident, struct lock_object *lock, int priority,
+ const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
+{
+ struct thread *td;
+ struct proc *p;
+ struct lock_class *class;
+ int catch, lock_state, pri, rval, sleepq_flags;
+ WITNESS_SAVE_DECL(lock_witness);
+
+ td = curthread;
+ p = td->td_proc;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, wmesg);
+#endif
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Sleeping on \"%s\"", wmesg);
+ KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
+ ("sleeping without a lock"));
+ KASSERT(p != NULL, ("msleep1"));
+ KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+ if (priority & PDROP)
+ KASSERT(lock != NULL && lock != &Giant.lock_object,
+ ("PDROP requires a non-Giant lock"));
+ if (lock != NULL)
+ class = LOCK_CLASS(lock);
+ else
+ class = NULL;
+
+ if (cold || SCHEDULER_STOPPED()) {
+ /*
+ * During autoconfiguration, just return;
+ * don't run any other threads or panic below,
+ * in case this is the idle thread and already asleep.
+ * XXX: this used to do "s = splhigh(); splx(safepri);
+ * splx(s);" to give interrupts a chance, but there is
+ * no way to give interrupts a chance now.
+ */
+ if (lock != NULL && priority & PDROP)
+ class->lc_unlock(lock);
+ return (0);
+ }
+ catch = priority & PCATCH;
+ pri = priority & PRIMASK;
+
+ /*
+ * If we are already on a sleep queue, then remove us from that
+ * sleep queue first. We have to do this to handle recursive
+ * sleeps.
+ */
+ if (TD_ON_SLEEPQ(td))
+ sleepq_remove(td, td->td_wchan);
+
+ if ((uint8_t *)ident >= &pause_wchan[0] &&
+ (uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
+ sleepq_flags = SLEEPQ_PAUSE;
+ else
+ sleepq_flags = SLEEPQ_SLEEP;
+ if (catch)
+ sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
+
+ sleepq_lock(ident);
+ CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+ if (lock != NULL && lock != &Giant.lock_object &&
+ !(class->lc_flags & LC_SLEEPABLE)) {
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ } else
+ /* GCC needs to follow the Yellow Brick Road */
+ lock_state = -1;
+
+ /*
+ * We put ourselves on the sleep queue and start our timeout
+ * before calling thread_suspend_check, as we could stop there,
+ * and a wakeup or a SIGCONT (or both) could occur while we were
+ * stopped without resuming us. Thus, we must be ready for sleep
+ * when cursig() is called. If the wakeup happens while we're
+ * stopped, then td will no longer be on a sleep queue upon
+ * return from cursig().
+ */
+ sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
+ if (sbt != 0)
+ sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+ if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
+ sleepq_release(ident);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ sleepq_lock(ident);
+ }
+ if (sbt != 0 && catch)
+ rval = sleepq_timedwait_sig(ident, pri);
+ else if (sbt != 0)
+ rval = sleepq_timedwait(ident, pri);
+ else if (catch)
+ rval = sleepq_wait_sig(ident, pri);
+ else {
+ sleepq_wait(ident, pri);
+ rval = 0;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, wmesg);
+#endif
+ PICKUP_GIANT();
+ if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+ return (rval);
+}
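/*
 * Illustrative sketch, not part of this change: the usual condition-wait
 * pattern built on _sleep() via the msleep() wrapper, paired with wakeup().
 * "bar_mtx", "bar_ready" and the wait message are hypothetical.
 */
static struct mtx bar_mtx;
static int bar_ready;

static int
bar_wait(void)
{
	int error;

	mtx_lock(&bar_mtx);
	while (!bar_ready) {
		/* The interlock is dropped while asleep and re-taken here. */
		error = msleep(&bar_ready, &bar_mtx, PCATCH, "barwt", 0);
		if (error != 0) {
			mtx_unlock(&bar_mtx);
			return (error);	/* EINTR/ERESTART from a signal */
		}
	}
	mtx_unlock(&bar_mtx);
	return (0);
}

static void
bar_post(void)
{

	mtx_lock(&bar_mtx);
	bar_ready = 1;
	wakeup(&bar_ready);
	mtx_unlock(&bar_mtx);
}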
+
+int
+msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
+ sbintime_t sbt, sbintime_t pr, int flags)
+{
+ struct thread *td;
+ struct proc *p;
+ int rval;
+ WITNESS_SAVE_DECL(mtx);
+
+ td = curthread;
+ p = td->td_proc;
+ KASSERT(mtx != NULL, ("sleeping without a mutex"));
+ KASSERT(p != NULL, ("msleep1"));
+ KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+
+ if (cold || SCHEDULER_STOPPED()) {
+ /*
+ * During autoconfiguration, just return;
+ * don't run any other threads or panic below,
+ * in case this is the idle thread and already asleep.
+ * XXX: this used to do "s = splhigh(); splx(safepri);
+ * splx(s);" to give interrupts a chance, but there is
+ * no way to give interrupts a chance now.
+ */
+ return (0);
+ }
+
+ sleepq_lock(ident);
+ CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+
+ DROP_GIANT();
+ mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
+ WITNESS_SAVE(&mtx->lock_object, mtx);
+ mtx_unlock_spin(mtx);
+
+ /*
+ * We put ourselves on the sleep queue and start our timeout.
+ */
+ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
+ if (sbt != 0)
+ sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+
+ /*
+ * Can't call ktrace with any spin locks held, since it may need to
+ * lock the ktrace_mtx lock, and WITNESS_WARN considers it an error
+ * to hold any spin lock. Thus, we have to drop the sleepq spin
+ * lock while we handle those requests. This is safe since we have
+ * placed our thread on the sleep queue already.
+ */
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW)) {
+ sleepq_release(ident);
+ ktrcsw(1, 0, wmesg);
+ sleepq_lock(ident);
+ }
+#endif
+#ifdef WITNESS
+ sleepq_release(ident);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
+ wmesg);
+ sleepq_lock(ident);
+#endif
+ if (sbt != 0)
+ rval = sleepq_timedwait(ident, 0);
+ else {
+ sleepq_wait(ident, 0);
+ rval = 0;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, wmesg);
+#endif
+ PICKUP_GIANT();
+ mtx_lock_spin(mtx);
+ WITNESS_RESTORE(&mtx->lock_object, mtx);
+ return (rval);
+}
+
+/*
+ * pause() delays the calling thread by the given number of system ticks.
+ * During cold bootup, pause() uses the DELAY() function instead of
+ * the tsleep() function to do the waiting. The timeout must be
+ * greater than or equal to zero; for pause_sbt() it is given in
+ * sbintime_t units. A timeout of zero is treated as a delay of one tick.
+ */
+int
+pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
+{
+ KASSERT(sbt >= 0, ("pause: timeout must be >= 0"));
+
+ /* silently convert invalid timeouts */
+ if (sbt == 0)
+ sbt = tick_sbt;
+
+ if (cold) {
+ /*
+ * We delay one second at a time to avoid overflowing the
+ * system specific DELAY() function(s):
+ */
+ while (sbt >= SBT_1S) {
+ DELAY(1000000);
+ sbt -= SBT_1S;
+ }
+ /* Do the delay remainder, if any */
+ sbt = (sbt + SBT_1US - 1) / SBT_1US;
+ if (sbt > 0)
+ DELAY(sbt);
+ return (0);
+ }
+ return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags));
+}
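/*
 * Illustrative sketch, not part of this change: a driver-style settle delay
 * built on pause(); the wait message and duration are hypothetical.
 */
static void
baz_settle(void)
{

	/* Sleep roughly 100ms; during early boot this degrades to DELAY(). */
	pause("bzstl", hz / 10);
}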
+
+/*
+ * Make all threads sleeping on the specified identifier runnable.
+ */
+void
+wakeup(void *ident)
+{
+ int wakeup_swapper;
+
+ sleepq_lock(ident);
+ wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(ident);
+ if (wakeup_swapper) {
+ KASSERT(ident != &proc0,
+ ("wakeup and wakeup_swapper and proc0"));
+ kick_proc0();
+ }
+}
+
+/*
+ * Make a thread sleeping on the specified identifier runnable.
+ * May wake more than one thread if a target thread is currently
+ * swapped out.
+ */
+void
+wakeup_one(void *ident)
+{
+ int wakeup_swapper;
+
+ sleepq_lock(ident);
+ wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(ident);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+static void
+kdb_switch(void)
+{
+ thread_unlock(curthread);
+ kdb_backtrace();
+ kdb_reenter();
+ panic("%s: did not reenter debugger", __func__);
+}
+
+/*
+ * The machine independent parts of context switching.
+ */
+void
+mi_switch(int flags, struct thread *newtd)
+{
+ uint64_t runtime, new_switchtime;
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread; /* XXX */
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+ p = td->td_proc; /* XXX */
+ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
+#ifdef INVARIANTS
+ if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
+ mtx_assert(&Giant, MA_NOTOWNED);
+#endif
+ KASSERT(td->td_critnest == 1 || panicstr,
+ ("mi_switch: switch in a critical section"));
+ KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
+ ("mi_switch: switch must be voluntary or involuntary"));
+ KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
+
+ /*
+ * Don't perform context switches from the debugger.
+ */
+ if (kdb_active)
+ kdb_switch();
+ if (SCHEDULER_STOPPED())
+ return;
+ if (flags & SW_VOL) {
+ td->td_ru.ru_nvcsw++;
+ td->td_swvoltick = ticks;
+ } else
+ td->td_ru.ru_nivcsw++;
+#ifdef SCHED_STATS
+ SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
+#endif
+ /*
+ * Compute the amount of time during which the current
+ * thread was running, and add that to its total so far.
+ */
+ new_switchtime = cpu_ticks();
+ runtime = new_switchtime - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, new_switchtime);
+ td->td_generation++; /* bump preempt-detect counter */
+ PCPU_INC(cnt.v_swtch);
+ PCPU_SET(switchticks, ticks);
+ CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, td->td_name);
+#if (KTR_COMPILE & KTR_SCHED) != 0
+ if (TD_IS_IDLETHREAD(td))
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
+ "prio:%d", td->td_priority);
+ else
+ KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
+ "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
+ "lockname:\"%s\"", td->td_lockname);
+#endif
+ SDT_PROBE0(sched, , , preempt);
+#ifdef XEN
+ PT_UPDATES_FLUSH();
+#endif
+ sched_switch(td, newtd, flags);
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+ "prio:%d", td->td_priority);
+
+ CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, td->td_name);
+
+ /*
+ * If the last thread was exiting, finish cleaning it up.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+}
+
+/*
+ * Change thread state to be runnable, placing it on the run queue if
+ * it is in memory. If it is swapped out, return true so our caller
+ * will know to awaken the swapper.
+ */
+int
+setrunnable(struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
+ ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
+ switch (td->td_state) {
+ case TDS_RUNNING:
+ case TDS_RUNQ:
+ return (0);
+ case TDS_INHIBITED:
+ /*
+ * If we are only inhibited because we are swapped out,
+ * then arrange to swap in this process. Otherwise just return.
+ */
+ if (td->td_inhibitors != TDI_SWAPPED)
+ return (0);
+ /* FALLTHROUGH */
+ case TDS_CAN_RUN:
+ break;
+ default:
+ printf("state is 0x%x", td->td_state);
+ panic("setrunnable(2)");
+ }
+ if ((td->td_flags & TDF_INMEM) == 0) {
+ if ((td->td_flags & TDF_SWAPINREQ) == 0) {
+ td->td_flags |= TDF_SWAPINREQ;
+ return (1);
+ }
+ } else
+ sched_wakeup(td);
+ return (0);
+}
+
+/*
+ * Compute a tenex style load average of a quantity on
+ * 1, 5 and 15 minute intervals.
+ */
+static void
+loadav(void *arg)
+{
+ int i, nrun;
+ struct loadavg *avg;
+
+ nrun = sched_load();
+ avg = &averunnable;
+
+ for (i = 0; i < 3; i++)
+ avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+ nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+
+ /*
+ * Schedule the next update to occur after 5 seconds, but add a
+ * random variation to avoid synchronisation with processes that
+ * run at regular intervals.
+ */
+ callout_reset_sbt(&loadav_callout,
+ tick_sbt * (hz * 4 + (int)(random() % (hz * 2 + 1))), 0,
+ loadav, NULL, C_DIRECT_EXEC | C_HARDCLOCK);
+}
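/*
 * Illustrative sketch, not part of this change: the same exponentially
 * weighted moving average in floating point, to show what the fixed-point
 * update above computes.  With a 5 second sample period and a 60 second
 * horizon the decay factor is exp(-5/60) = exp(-1/12), matching cexp[0].
 * Userland-only illustration; requires <math.h>.
 */
#include <math.h>

static double
loadavg_step(double avg, int nrun, double period_sec, double horizon_sec)
{
	double decay = exp(-period_sec / horizon_sec);	/* e.g. ~0.9200 */

	return (avg * decay + (double)nrun * (1.0 - decay));
}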
+
+/* ARGSUSED */
+static void
+synch_setup(void *dummy)
+{
+ callout_init(&loadav_callout, CALLOUT_MPSAFE);
+
+ /* Kick off timeout driven events by calling first time. */
+ loadav(NULL);
+}
+
+int
+should_yield(void)
+{
+
+ return ((unsigned int)(ticks - curthread->td_swvoltick) >= hogticks);
+}
+
+void
+maybe_yield(void)
+{
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+}
+
+void
+kern_yield(int prio)
+{
+ struct thread *td;
+
+ td = curthread;
+ DROP_GIANT();
+ thread_lock(td);
+ if (prio == PRI_USER)
+ prio = td->td_user_pri;
+ if (prio >= 0)
+ sched_prio(td, prio);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+ PICKUP_GIANT();
+}
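/*
 * Illustrative sketch, not part of this change: cooperative yielding from a
 * long-running kernel loop.  "quux_process_one" and the item count are
 * hypothetical.
 */
static void	quux_process_one(int item);

static void
quux_process_all(int nitems)
{
	int i;

	for (i = 0; i < nitems; i++) {
		quux_process_one(i);
		/*
		 * should_yield() becomes true once we have run for at
		 * least hogticks ticks since the last voluntary switch;
		 * maybe_yield() then enters kern_yield(PRI_USER).
		 */
		maybe_yield();
	}
}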
+
+/*
+ * General purpose yield system call.
+ */
+int
+sys_yield(struct thread *td, struct yield_args *uap)
+{
+
+ thread_lock(td);
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, PRI_MAX_TIMESHARE);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+ td->td_retval[0] = 0;
+ return (0);
+}
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
new file mode 100644
index 0000000..03f6088
--- /dev/null
+++ b/sys/kern/kern_syscalls.c
@@ -0,0 +1,220 @@
+/*-
+ * Copyright (c) 1999 Assar Westerlund
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <machine/atomic.h>
+
+/*
+ * Acts like "nosys" but can be identified in sysent, so that call
+ * numbers can be assigned dynamically for a limited number of calls.
+ *
+ * Placeholder for system call slots reserved for loadable modules.
+ */
+int
+lkmnosys(struct thread *td, struct nosys_args *args)
+{
+
+ return (nosys(td, args));
+}
+
+int
+lkmressys(struct thread *td, struct nosys_args *args)
+{
+
+ return (nosys(td, args));
+}
+
+static void
+syscall_thread_drain(struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ KASSERT((oldcnt & SY_THR_STATIC) == 0,
+ ("drain on static syscall"));
+ cnt = oldcnt | SY_THR_DRAINING;
+ } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+ while (atomic_cmpset_32(&se->sy_thrcnt, SY_THR_DRAINING,
+ SY_THR_ABSENT) == 0)
+ pause("scdrn", hz/2);
+}
+
+int
+syscall_thread_enter(struct thread *td, struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ if ((oldcnt & SY_THR_STATIC) != 0)
+ return (0);
+ if ((oldcnt & (SY_THR_DRAINING | SY_THR_ABSENT)) != 0)
+ return (ENOSYS);
+ cnt = oldcnt + SY_THR_INCR;
+ } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+ return (0);
+}
+
+void
+syscall_thread_exit(struct thread *td, struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ if ((oldcnt & SY_THR_STATIC) != 0)
+ return;
+ cnt = oldcnt - SY_THR_INCR;
+ } while (atomic_cmpset_rel_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+}
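/*
 * Illustrative sketch, not part of this change: how a syscall dispatcher
 * could bracket handler execution with the per-entry thread count above, so
 * that syscall_thread_drain() can wait out in-flight calls.  Simplified;
 * "se" and "args" are assumed to be set up by the caller.
 */
static int
dispatch_one(struct thread *td, struct sysent *se, void *args)
{
	int error;

	error = syscall_thread_enter(td, se);
	if (error != 0)
		return (error);		/* entry absent or being drained */
	error = se->sy_call(td, args);
	syscall_thread_exit(td, se);
	return (error);
}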
+
+int
+syscall_register(int *offset, struct sysent *new_sysent,
+ struct sysent *old_sysent)
+{
+ int i;
+
+ if (*offset == NO_SYSCALL) {
+ for (i = 1; i < SYS_MAXSYSCALL; ++i)
+ if (sysent[i].sy_call == (sy_call_t *)lkmnosys)
+ break;
+ if (i == SYS_MAXSYSCALL)
+ return (ENFILE);
+ *offset = i;
+ } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
+ return (EINVAL);
+ else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
+ sysent[*offset].sy_call != (sy_call_t *)lkmressys)
+ return (EEXIST);
+
+ KASSERT(sysent[*offset].sy_thrcnt == SY_THR_ABSENT,
+ ("dynamic syscall is not protected"));
+ *old_sysent = sysent[*offset];
+ new_sysent->sy_thrcnt = SY_THR_ABSENT;
+ sysent[*offset] = *new_sysent;
+ atomic_store_rel_32(&sysent[*offset].sy_thrcnt, 0);
+ return (0);
+}
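/*
 * Illustrative sketch, not part of this change: direct use of
 * syscall_register()/syscall_deregister() from a module with a dynamically
 * assigned slot.  The handler and its sysent are hypothetical; most modules
 * would use the SYSCALL_MODULE() convenience macro instead.
 */
static int
mysc(struct thread *td, void *args)
{

	td->td_retval[0] = 0;
	return (0);
}

static struct sysent mysc_sysent = {
	.sy_narg = 0,
	.sy_call = (sy_call_t *)mysc,
};
static struct sysent mysc_old_sysent;
static int mysc_offset = NO_SYSCALL;

static int
mysc_load(void)
{

	/* Picks a free lkmnosys() slot and installs the handler there. */
	return (syscall_register(&mysc_offset, &mysc_sysent, &mysc_old_sysent));
}

static void
mysc_unload(void)
{

	syscall_deregister(&mysc_offset, &mysc_old_sysent);
}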
+
+int
+syscall_deregister(int *offset, struct sysent *old_sysent)
+{
+
+ if (*offset) {
+ syscall_thread_drain(&sysent[*offset]);
+ sysent[*offset] = *old_sysent;
+ }
+ return (0);
+}
+
+int
+syscall_module_handler(struct module *mod, int what, void *arg)
+{
+ struct syscall_module_data *data = arg;
+ modspecific_t ms;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ error = syscall_register(data->offset, data->new_sysent,
+ &data->old_sysent);
+ if (error) {
+ /* Leave a mark so we know to safely unload below. */
+ data->offset = NULL;
+ return (error);
+ }
+ ms.intval = *data->offset;
+ MOD_XLOCK;
+ module_setspecific(mod, &ms);
+ MOD_XUNLOCK;
+ if (data->chainevh)
+ error = data->chainevh(mod, what, data->chainarg);
+ return (error);
+ case MOD_UNLOAD:
+ /*
+ * MOD_LOAD failed, so just return without calling the
+ * chained handler since we didn't pass along the MOD_LOAD
+ * event.
+ */
+ if (data->offset == NULL)
+ return (0);
+ if (data->chainevh) {
+ error = data->chainevh(mod, what, data->chainarg);
+ if (error)
+ return (error);
+ }
+ error = syscall_deregister(data->offset, &data->old_sysent);
+ return (error);
+ default:
+ if (data->chainevh)
+ return (data->chainevh(mod, what, data->chainarg));
+ return (EOPNOTSUPP);
+ }
+
+ /* NOTREACHED */
+}
+
+int
+syscall_helper_register(struct syscall_helper_data *sd)
+{
+ struct syscall_helper_data *sd1;
+ int error;
+
+ for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) {
+ error = syscall_register(&sd1->syscall_no, &sd1->new_sysent,
+ &sd1->old_sysent);
+ if (error != 0) {
+ syscall_helper_unregister(sd);
+ return (error);
+ }
+ sd1->registered = 1;
+ }
+ return (0);
+}
+
+int
+syscall_helper_unregister(struct syscall_helper_data *sd)
+{
+ struct syscall_helper_data *sd1;
+
+ for (sd1 = sd; sd1->registered != 0; sd1++) {
+ syscall_deregister(&sd1->syscall_no, &sd1->old_sysent);
+ sd1->registered = 0;
+ }
+ return (0);
+}
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
new file mode 100644
index 0000000..416f85f
--- /dev/null
+++ b/sys/kern/kern_sysctl.c
@@ -0,0 +1,1656 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/fail.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
+static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
+static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
+
+/*
+ * The sysctllock protects the MIB tree. It also protects sysctl
+ * contexts used with dynamic sysctls. The sysctl_register_oid() and
+ * sysctl_unregister_oid() routines require the sysctllock to already
+ * be held, so the sysctl_lock() and sysctl_unlock() routines are
+ * provided for the few places in the kernel which need to use that
+ * API rather than using the dynamic API. Use of the dynamic API is
+ * strongly encouraged for most code.
+ *
+ * The sysctlmemlock is used to limit the amount of user memory wired for
+ * sysctl requests. This is implemented by serializing any userland
+ * sysctl requests larger than a single page via an exclusive lock.
+ */
+static struct sx sysctllock;
+static struct sx sysctlmemlock;
+
+#define SYSCTL_XLOCK() sx_xlock(&sysctllock)
+#define SYSCTL_XUNLOCK() sx_xunlock(&sysctllock)
+#define SYSCTL_ASSERT_XLOCKED() sx_assert(&sysctllock, SA_XLOCKED)
+#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock")
+#define SYSCTL_SLEEP(ch, wmesg, timo) \
+ sx_sleep(ch, &sysctllock, 0, wmesg, timo)
+
+static int sysctl_root(SYSCTL_HANDLER_ARGS);
+
+struct sysctl_oid_list sysctl__children; /* root list */
+
+static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
+ int recurse);
+
+static struct sysctl_oid *
+sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ SLIST_FOREACH(oidp, list, oid_link) {
+ if (strcmp(oidp->oid_name, name) == 0) {
+ return (oidp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Initialization of the MIB tree.
+ *
+ * Order by number in each list.
+ */
+void
+sysctl_lock(void)
+{
+
+ SYSCTL_XLOCK();
+}
+
+void
+sysctl_unlock(void)
+{
+
+ SYSCTL_XUNLOCK();
+}
+
+void
+sysctl_register_oid(struct sysctl_oid *oidp)
+{
+ struct sysctl_oid_list *parent = oidp->oid_parent;
+ struct sysctl_oid *p;
+ struct sysctl_oid *q;
+
+ /*
+ * First check if another oid with the same name already
+ * exists in the parent's list.
+ */
+ SYSCTL_ASSERT_XLOCKED();
+ p = sysctl_find_oidname(oidp->oid_name, parent);
+ if (p != NULL) {
+ if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ p->oid_refcnt++;
+ return;
+ } else {
+ printf("can't re-use a leaf (%s)!\n", p->oid_name);
+ return;
+ }
+ }
+ /*
+ * If this oid has a number OID_AUTO, give it a number which
+ * is greater than any current oid.
+ * NOTE: DO NOT change the starting value here, change it in
+ * <sys/sysctl.h>, and make sure it is at least 256 to
+ * accommodate e.g. net.inet.raw as a static sysctl node.
+ */
+ if (oidp->oid_number == OID_AUTO) {
+ static int newoid = CTL_AUTO_START;
+
+ oidp->oid_number = newoid++;
+ if (newoid == 0x7fffffff)
+ panic("out of oids");
+ }
+#if 0
+ else if (oidp->oid_number >= CTL_AUTO_START) {
+ /* do not panic; this happens when unregistering sysctl sets */
+ printf("static sysctl oid too high: %d", oidp->oid_number);
+ }
+#endif
+
+ /*
+ * Insert the oid into the parent's list in order.
+ */
+ q = NULL;
+ SLIST_FOREACH(p, parent, oid_link) {
+ if (oidp->oid_number < p->oid_number)
+ break;
+ q = p;
+ }
+ if (q)
+ SLIST_INSERT_AFTER(q, oidp, oid_link);
+ else
+ SLIST_INSERT_HEAD(parent, oidp, oid_link);
+}
+
+void
+sysctl_unregister_oid(struct sysctl_oid *oidp)
+{
+ struct sysctl_oid *p;
+ int error;
+
+ SYSCTL_ASSERT_XLOCKED();
+ error = ENOENT;
+ if (oidp->oid_number == OID_AUTO) {
+ error = EINVAL;
+ } else {
+ SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
+ if (p == oidp) {
+ SLIST_REMOVE(oidp->oid_parent, oidp,
+ sysctl_oid, oid_link);
+ error = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * This can happen when a module fails to register and is
+ * being unloaded afterwards. It should not be a panic()
+ * for normal use.
+ */
+ if (error)
+ printf("%s: failed to unregister sysctl\n", __func__);
+}
+
+/* Initialize a new context to keep track of dynamically added sysctls. */
+int
+sysctl_ctx_init(struct sysctl_ctx_list *c)
+{
+
+ if (c == NULL) {
+ return (EINVAL);
+ }
+
+ /*
+ * No locking here, the caller is responsible for not adding
+ * new nodes to a context until after this function has
+ * returned.
+ */
+ TAILQ_INIT(c);
+ return (0);
+}
+
+/* Free the context, and destroy all dynamic oids registered in this context */
+int
+sysctl_ctx_free(struct sysctl_ctx_list *clist)
+{
+ struct sysctl_ctx_entry *e, *e1;
+ int error;
+
+ error = 0;
+ /*
+ * First perform a "dry run" to check if it's ok to remove oids.
+ * XXX FIXME
+ * XXX This algorithm is a hack. But I don't know any
+ * XXX better solution for now...
+ */
+ SYSCTL_XLOCK();
+ TAILQ_FOREACH(e, clist, link) {
+ error = sysctl_remove_oid_locked(e->entry, 0, 0);
+ if (error)
+ break;
+ }
+ /*
+ * Restore deregistered entries, either from the end,
+ * or from the place where the error occurred.
+ * e contains the entry that was not unregistered.
+ */
+ if (error)
+ e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
+ else
+ e1 = TAILQ_LAST(clist, sysctl_ctx_list);
+ while (e1 != NULL) {
+ sysctl_register_oid(e1->entry);
+ e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
+ }
+ if (error) {
+ SYSCTL_XUNLOCK();
+ return(EBUSY);
+ }
+ /* Now really delete the entries */
+ e = TAILQ_FIRST(clist);
+ while (e != NULL) {
+ e1 = TAILQ_NEXT(e, link);
+ error = sysctl_remove_oid_locked(e->entry, 1, 0);
+ if (error)
+ panic("sysctl_remove_oid: corrupt tree, entry: %s",
+ e->entry->oid_name);
+ free(e, M_SYSCTLOID);
+ e = e1;
+ }
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+/* Add an entry to the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
+ e->entry = oidp;
+ TAILQ_INSERT_HEAD(clist, e, link);
+ return (e);
+}
+
+/* Find an entry in the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ TAILQ_FOREACH(e, clist, link) {
+ if(e->entry == oidp)
+ return(e);
+ }
+ return (e);
+}
+
+/*
+ * Delete an entry from the context.
+ * NOTE: this function doesn't free oidp! You have to remove it
+ * with sysctl_remove_oid().
+ */
+int
+sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ if (clist == NULL || oidp == NULL)
+ return (EINVAL);
+ SYSCTL_XLOCK();
+ e = sysctl_ctx_entry_find(clist, oidp);
+ if (e != NULL) {
+ TAILQ_REMOVE(clist, e, link);
+ SYSCTL_XUNLOCK();
+ free(e, M_SYSCTLOID);
+ return (0);
+ } else {
+ SYSCTL_XUNLOCK();
+ return (ENOENT);
+ }
+}
+
+/*
+ * Remove dynamically created sysctl trees.
+ * oidp - top of the tree to be removed
+ * del - if 0 - just deregister, otherwise free up entries as well
+ * recurse - if != 0 traverse the subtree to be deleted
+ */
+int
+sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
+{
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_remove_oid_locked(oidp, del, recurse);
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+int
+sysctl_remove_name(struct sysctl_oid *parent, const char *name,
+ int del, int recurse)
+{
+ struct sysctl_oid *p, *tmp;
+ int error;
+
+ error = ENOENT;
+ SYSCTL_XLOCK();
+ SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
+ if (strcmp(p->oid_name, name) == 0) {
+ error = sysctl_remove_oid_locked(p, del, recurse);
+ break;
+ }
+ }
+ SYSCTL_XUNLOCK();
+
+ return (error);
+}
+
+
+static int
+sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
+{
+ struct sysctl_oid *p, *tmp;
+ int error;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (oidp == NULL)
+ return(EINVAL);
+ if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
+ printf("can't remove non-dynamic nodes!\n");
+ return (EINVAL);
+ }
+ /*
+ * WARNING: normally this should be done through
+ * sysctl_ctx_free(). Use recursion only as a last-resort
+ * method to purge your sysctl tree of leftovers...
+ * However, if some other code still references these nodes,
+ * it will panic.
+ */
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oidp->oid_refcnt == 1) {
+ SLIST_FOREACH_SAFE(p,
+ SYSCTL_CHILDREN(oidp), oid_link, tmp) {
+ if (!recurse) {
+ printf("Warning: failed attempt to "
+ "remove oid %s with child %s\n",
+ oidp->oid_name, p->oid_name);
+ return (ENOTEMPTY);
+ }
+ error = sysctl_remove_oid_locked(p, del,
+ recurse);
+ if (error)
+ return (error);
+ }
+ if (del)
+ free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
+ }
+ }
+ if (oidp->oid_refcnt > 1) {
+ oidp->oid_refcnt--;
+ } else {
+ if (oidp->oid_refcnt == 0) {
+ printf("Warning: bad oid_refcnt=%u (%s)!\n",
+ oidp->oid_refcnt, oidp->oid_name);
+ return (EINVAL);
+ }
+ sysctl_unregister_oid(oidp);
+ if (del) {
+ /*
+ * Wait for all threads running the handler to drain.
+ * This preserves the previous behavior when the
+ * sysctl lock was held across a handler invocation,
+ * and is necessary for module unload correctness.
+ */
+ while (oidp->oid_running > 0) {
+ oidp->oid_kind |= CTLFLAG_DYING;
+ SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
+ }
+ if (oidp->oid_descr)
+ free(__DECONST(char *, oidp->oid_descr),
+ M_SYSCTLOID);
+ free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID);
+ free(oidp, M_SYSCTLOID);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Create new sysctls at run time.
+ * clist may point to a valid context initialized with sysctl_ctx_init().
+ */
+struct sysctl_oid *
+sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
+ int number, const char *name, int kind, void *arg1, intptr_t arg2,
+ int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
+{
+ struct sysctl_oid *oidp;
+
+ /* You have to hook up somewhere.. */
+ if (parent == NULL)
+ return(NULL);
+ /* Check if the node already exists, otherwise create it */
+ SYSCTL_XLOCK();
+ oidp = sysctl_find_oidname(name, parent);
+ if (oidp != NULL) {
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ oidp->oid_refcnt++;
+ /* Update the context */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ SYSCTL_XUNLOCK();
+ return (oidp);
+ } else {
+ SYSCTL_XUNLOCK();
+ printf("can't re-use a leaf (%s)!\n", name);
+ return (NULL);
+ }
+ }
+ oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
+ oidp->oid_parent = parent;
+ SLIST_NEXT(oidp, oid_link) = NULL;
+ oidp->oid_number = number;
+ oidp->oid_refcnt = 1;
+ oidp->oid_name = strdup(name, M_SYSCTLOID);
+ oidp->oid_handler = handler;
+ oidp->oid_kind = CTLFLAG_DYN | kind;
+ if ((kind & CTLTYPE) == CTLTYPE_NODE) {
+ /* Allocate space for children */
+ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
+ M_SYSCTLOID, M_WAITOK));
+ SLIST_INIT(SYSCTL_CHILDREN(oidp));
+ oidp->oid_arg2 = arg2;
+ } else {
+ oidp->oid_arg1 = arg1;
+ oidp->oid_arg2 = arg2;
+ }
+ oidp->oid_fmt = fmt;
+ if (descr)
+ oidp->oid_descr = strdup(descr, M_SYSCTLOID);
+ /* Update the context, if used */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ /* Register this oid */
+ sysctl_register_oid(oidp);
+ SYSCTL_XUNLOCK();
+ return (oidp);
+}
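/*
 * Illustrative sketch, not part of this change: dynamic sysctl creation
 * through a context, as a driver might do at attach time.  The node names,
 * the variable and the parent (_debug) are hypothetical; SYSCTL_ADD_NODE()
 * and SYSCTL_ADD_INT() expand to sysctl_add_oid() above.
 */
static struct sysctl_ctx_list mydev_ctx;
static int mydev_verbose;

static int
mydev_sysctl_attach(void)
{
	struct sysctl_oid *node;

	sysctl_ctx_init(&mydev_ctx);
	node = SYSCTL_ADD_NODE(&mydev_ctx, SYSCTL_STATIC_CHILDREN(_debug),
	    OID_AUTO, "mydev", CTLFLAG_RD, NULL, "mydev knobs");
	if (node == NULL)
		return (ENOMEM);
	SYSCTL_ADD_INT(&mydev_ctx, SYSCTL_CHILDREN(node), OID_AUTO,
	    "verbose", CTLFLAG_RW, &mydev_verbose, 0, "verbosity level");
	return (0);
}

static void
mydev_sysctl_detach(void)
{

	/* Tears down everything that was added through the context. */
	sysctl_ctx_free(&mydev_ctx);
}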
+
+/*
+ * Rename an existing oid.
+ */
+void
+sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
+{
+ char *newname;
+ char *oldname;
+
+ newname = strdup(name, M_SYSCTLOID);
+ SYSCTL_XLOCK();
+ oldname = __DECONST(char *, oidp->oid_name);
+ oidp->oid_name = newname;
+ SYSCTL_XUNLOCK();
+ free(oldname, M_SYSCTLOID);
+}
+
+/*
+ * Reparent an existing oid.
+ */
+int
+sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_XLOCK();
+ if (oid->oid_parent == parent) {
+ SYSCTL_XUNLOCK();
+ return (0);
+ }
+ oidp = sysctl_find_oidname(oid->oid_name, parent);
+ if (oidp != NULL) {
+ SYSCTL_XUNLOCK();
+ return (EEXIST);
+ }
+ sysctl_unregister_oid(oid);
+ oid->oid_parent = parent;
+ oid->oid_number = OID_AUTO;
+ sysctl_register_oid(oid);
+ SYSCTL_XUNLOCK();
+ return (0);
+}
+
+/*
+ * Register the kernel's oids on startup.
+ */
+SET_DECLARE(sysctl_set, struct sysctl_oid);
+
+static void
+sysctl_register_all(void *arg)
+{
+ struct sysctl_oid **oidp;
+
+ sx_init(&sysctlmemlock, "sysctl mem");
+ SYSCTL_INIT();
+ SYSCTL_XLOCK();
+ SET_FOREACH(oidp, sysctl_set)
+ sysctl_register_oid(*oidp);
+ SYSCTL_XUNLOCK();
+}
+SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
+
+/*
+ * "Staff-functions"
+ *
+ * These functions implement a presently undocumented interface
+ * used by the sysctl program to walk the tree, and get the type
+ * so it can print the value.
+ * This interface is under work and consideration, and should probably
+ * be killed with a big axe by the first person who can find the time.
+ * (Be aware, though, that the proper interface isn't as obvious as it
+ * may seem; there are various conflicting requirements.)
+ *
+ * {0,0} printf the entire MIB-tree.
+ * {0,1,...} return the name of the "..." OID.
+ * {0,2,...} return the next OID.
+ * {0,3} return the OID of the name in "new"
+ * {0,4,...} return the kind & format info for the "..." OID.
+ * {0,5,...} return the description of the "..." OID.
+ */
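/*
 * Illustrative sketch, not part of this change: how userland reaches the
 * interface described above via sysctl(3).  sysctlnametomib() resolves
 * "kern.ostype" through the {0,3} name-to-OID node; the value is then read
 * through the resulting MIB.  Userland-only code with abbreviated error
 * handling.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME;
	char buf[64];
	size_t buflen = sizeof(buf);

	if (sysctlnametomib("kern.ostype", mib, &miblen) == -1)
		return (1);
	if (sysctl(mib, (u_int)miblen, buf, &buflen, NULL, 0) == -1)
		return (1);
	printf("%s\n", buf);
	return (0);
}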
+
+#ifdef SYSCTL_DEBUG
+static void
+sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
+{
+ int k;
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ SLIST_FOREACH(oidp, l, oid_link) {
+
+ for (k=0; k<i; k++)
+ printf(" ");
+
+ printf("%d %s ", oidp->oid_number, oidp->oid_name);
+
+ printf("%c%c",
+ oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
+ oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
+
+ if (oidp->oid_handler)
+ printf(" *Handler");
+
+ switch (oidp->oid_kind & CTLTYPE) {
+ case CTLTYPE_NODE:
+ printf(" Node\n");
+ if (!oidp->oid_handler) {
+ sysctl_sysctl_debug_dump_node(
+ oidp->oid_arg1, i+2);
+ }
+ break;
+ case CTLTYPE_INT: printf(" Int\n"); break;
+ case CTLTYPE_UINT: printf(" u_int\n"); break;
+ case CTLTYPE_LONG: printf(" Long\n"); break;
+ case CTLTYPE_ULONG: printf(" u_long\n"); break;
+ case CTLTYPE_STRING: printf(" String\n"); break;
+ case CTLTYPE_U64: printf(" uint64_t\n"); break;
+ case CTLTYPE_S64: printf(" int64_t\n"); break;
+ case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
+ default: printf("\n");
+ }
+
+ }
+}
+
+static int
+sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
+ if (error)
+ return (error);
+ SYSCTL_XLOCK();
+ sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
+ SYSCTL_XUNLOCK();
+ return (ENOENT);
+}
+
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+ 0, 0, sysctl_sysctl_debug, "-", "");
+#endif
+
+static int
+sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int error = 0;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
+ char buf[10];
+
+ SYSCTL_XLOCK();
+ while (namelen) {
+ if (!lsp) {
+ snprintf(buf,sizeof(buf),"%d",*name);
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ if (error)
+ goto out;
+ namelen--;
+ name++;
+ continue;
+ }
+ lsp2 = 0;
+ SLIST_FOREACH(oid, lsp, oid_link) {
+ if (oid->oid_number != *name)
+ continue;
+
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, oid->oid_name,
+ strlen(oid->oid_name));
+ if (error)
+ goto out;
+
+ namelen--;
+ name++;
+
+ if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oid->oid_handler)
+ break;
+
+ lsp2 = SYSCTL_CHILDREN(oid);
+ break;
+ }
+ lsp = lsp2;
+ }
+ error = SYSCTL_OUT(req, "", 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
+ * capability mode.
+ */
+static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_CAPRD,
+ sysctl_sysctl_name, "");
+
+static int
+sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
+ int *next, int *len, int level, struct sysctl_oid **oidpp)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ *len = level;
+ SLIST_FOREACH(oidp, lsp, oid_link) {
+ *next = oidp->oid_number;
+ *oidpp = oidp;
+
+ if (oidp->oid_kind & CTLFLAG_SKIP)
+ continue;
+
+ if (!namelen) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return (0);
+ if (oidp->oid_handler)
+ /* We really should call the handler here...*/
+ return (0);
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
+ len, level+1, oidpp))
+ return (0);
+ goto emptynode;
+ }
+
+ if (oidp->oid_number < *name)
+ continue;
+
+ if (oidp->oid_number > *name) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return (0);
+ if (oidp->oid_handler)
+ return (0);
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
+ next+1, len, level+1, oidpp))
+ return (0);
+ goto next;
+ }
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ continue;
+
+ if (oidp->oid_handler)
+ continue;
+
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
+ len, level+1, oidpp))
+ return (0);
+ next:
+ namelen = 1;
+ emptynode:
+ *len = level;
+ }
+ return (1);
+}
+
+static int
+sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ int newoid[CTL_MAXNAME];
+
+ SYSCTL_XLOCK();
+ i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
+ SYSCTL_XUNLOCK();
+ if (i)
+ return (ENOENT);
+ error = SYSCTL_OUT(req, newoid, j * sizeof (int));
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
+ * capability mode.
+ */
+static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_CAPRD,
+ sysctl_sysctl_next, "");
+
+static int
+name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
+{
+ struct sysctl_oid *oidp;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ char *p;
+
+ SYSCTL_ASSERT_XLOCKED();
+
+ for (*len = 0; *len < CTL_MAXNAME;) {
+ p = strsep(&name, ".");
+
+ oidp = SLIST_FIRST(lsp);
+ for (;; oidp = SLIST_NEXT(oidp, oid_link)) {
+ if (oidp == NULL)
+ return (ENOENT);
+ if (strcmp(p, oidp->oid_name) == 0)
+ break;
+ }
+ *oid++ = oidp->oid_number;
+ (*len)++;
+
+ if (name == NULL || *name == '\0') {
+ if (oidpp)
+ *oidpp = oidp;
+ return (0);
+ }
+
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oidp->oid_handler)
+ break;
+
+ lsp = SYSCTL_CHILDREN(oidp);
+ }
+ return (ENOENT);
+}
+
+static int
+sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
+{
+ char *p;
+ int error, oid[CTL_MAXNAME], len = 0;
+ struct sysctl_oid *op = 0;
+
+ if (!req->newlen)
+ return (ENOENT);
+ if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */
+ return (ENAMETOOLONG);
+
+ p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
+
+ error = SYSCTL_IN(req, p, req->newlen);
+ if (error) {
+ free(p, M_SYSCTL);
+ return (error);
+ }
+
+ p[req->newlen] = '\0';
+
+ SYSCTL_XLOCK();
+ error = name2oid(p, oid, &len, &op);
+ SYSCTL_XUNLOCK();
+
+ free(p, M_SYSCTL);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_OUT(req, oid, len * sizeof *oid);
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
+ * capability mode.
+ */
+SYSCTL_PROC(_sysctl, 3, name2oid,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
+ | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
+
+static int
+sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ goto out;
+
+ if (oid->oid_fmt == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
+ if (error)
+ goto out;
+ error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+
+static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
+ sysctl_sysctl_oidfmt, "");
+
+static int
+sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ goto out;
+
+ if (oid->oid_descr == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_CAPRD,
+ sysctl_sysctl_oiddescr, "");
+
+/*
+ * Default "handler" functions.
+ */
+
+/*
+ * Handle an int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+ int tmpout, error = 0;
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmpout = *(int *)arg1;
+ else
+ tmpout = arg2;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(int));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(int));
+ return (error);
+}
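+
+/*
+ * Usage sketch (illustrative only; the variable, bounds and oid name are
+ * assumptions): a subsystem typically wraps sysctl_handle_int() in a private
+ * handler when the new value must be validated before it is accepted.
+ *
+ *   static int example_limit = 10;
+ *
+ *   static int
+ *   sysctl_example_limit(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           int error, val;
+ *
+ *           val = example_limit;
+ *           error = sysctl_handle_int(oidp, &val, 0, req);
+ *           if (error != 0 || req->newptr == NULL)
+ *                   return (error);
+ *           if (val < 0 || val > 100)
+ *                   return (EINVAL);
+ *           example_limit = val;
+ *           return (0);
+ *   }
+ *
+ *   SYSCTL_PROC(_kern, OID_AUTO, example_limit,
+ *       CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ *       sysctl_example_limit, "I", "example limit, 0..100");
+ */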
+
+/*
+ * Based on sysctl_handle_int(), convert milliseconds into ticks.
+ * Note: this is used by TCP.
+ */
+
+int
+sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
+{
+ int error, s, tt;
+
+ tt = *(int *)arg1;
+ s = (int)((int64_t)tt * 1000 / hz);
+
+ error = sysctl_handle_int(oidp, &s, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ tt = (int)((int64_t)s * hz / 1000);
+ if (tt < 1)
+ return (EINVAL);
+
+ *(int *)arg1 = tt;
+ return (0);
+}
+
+
+/*
+ * Handle a long, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ long tmplong;
+#ifdef SCTL_MASK32
+ int tmpint;
+#endif
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmplong = *(long *)arg1;
+ else
+ tmplong = arg2;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ tmpint = tmplong;
+ error = SYSCTL_OUT(req, &tmpint, sizeof(int));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &tmplong, sizeof(long));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+#ifdef SCTL_MASK32
+ else if (req->flags & SCTL_MASK32) {
+ error = SYSCTL_IN(req, &tmpint, sizeof(int));
+ *(long *)arg1 = (long)tmpint;
+ }
+#endif
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(long));
+ return (error);
+}
+
+/*
+ * Handle a 64 bit int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+int
+sysctl_handle_64(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ uint64_t tmpout;
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmpout = *(uint64_t *)arg1;
+ else
+ tmpout = arg2;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
+ return (error);
+}
+
+/*
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
+ */
+
+int
+sysctl_handle_string(SYSCTL_HANDLER_ARGS)
+{
+ int error=0;
+ char *tmparg;
+ size_t outlen;
+
+ /*
+ * Attempt to get a coherent snapshot by copying to a
+ * temporary kernel buffer.
+ */
+retry:
+ outlen = strlen((char *)arg1)+1;
+ tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
+
+ if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
+ free(tmparg, M_SYSCTLTMP);
+ goto retry;
+ }
+
+ error = SYSCTL_OUT(req, tmparg, outlen);
+ free(tmparg, M_SYSCTLTMP);
+
+ if (error || !req->newptr)
+ return (error);
+
+ if ((req->newlen - req->newidx) >= arg2) {
+ error = EINVAL;
+ } else {
+ arg2 = (req->newlen - req->newidx);
+ error = SYSCTL_IN(req, arg1, arg2);
+ ((char *)arg1)[arg2] = '\0';
+ }
+
+ return (error);
+}
+
+/*
+ * Handle any kind of opaque data.
+ * arg1 points to it, arg2 is the size.
+ */
+
+int
+sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
+{
+ int error, tries;
+ u_int generation;
+ struct sysctl_req req2;
+
+ /*
+ * Attempt to get a coherent snapshot, by using the thread
+ * pre-emption counter updated from within mi_switch() to
+ * determine if we were pre-empted during a bcopy() or
+ * copyout(). Make 3 attempts at doing this before giving up.
+ * If we encounter an error, stop immediately.
+ */
+ tries = 0;
+ req2 = *req;
+retry:
+ generation = curthread->td_generation;
+ error = SYSCTL_OUT(req, arg1, arg2);
+ if (error)
+ return (error);
+ tries++;
+ if (generation != curthread->td_generation && tries < 3) {
+ *req = req2;
+ goto retry;
+ }
+
+ error = SYSCTL_IN(req, arg1, arg2);
+
+ return (error);
+}
+
+/*
+ * Transfer functions to/from kernel space.
+ * XXX: rather untested at this point
+ */
+static int
+sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i = 0;
+
+ if (req->oldptr) {
+ i = l;
+ if (req->oldlen <= req->oldidx)
+ i = 0;
+ else
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ bcopy(p, (char *)req->oldptr + req->oldidx, i);
+ }
+ req->oldidx += l;
+ if (req->oldptr && i != l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
+{
+ if (!req->newptr)
+ return (0);
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ bcopy((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (0);
+}
+
+int
+kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
+{
+ int error = 0;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+ req.flags = flags;
+
+ if (oldlenp) {
+ req.oldlen = *oldlenp;
+ }
+ req.validlen = req.oldlen;
+
+ if (old) {
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_kernel;
+ req.newfunc = sysctl_new_kernel;
+ req.lock = REQ_UNWIRED;
+
+ SYSCTL_XLOCK();
+ error = sysctl_root(0, name, namelen, &req);
+ SYSCTL_XUNLOCK();
+
+ if (req.lock == REQ_WIRED && req.validlen > 0)
+ vsunlock(req.oldptr, req.validlen);
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.validlen)
+ *retval = req.validlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+int
+kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
+ void *new, size_t newlen, size_t *retval, int flags)
+{
+ int oid[CTL_MAXNAME];
+ size_t oidlen, plen;
+ int error;
+
+ oid[0] = 0; /* sysctl internal magic */
+ oid[1] = 3; /* name2oid */
+ oidlen = sizeof(oid);
+
+ error = kernel_sysctl(td, oid, 2, oid, &oidlen,
+ (void *)name, strlen(name), &plen, flags);
+ if (error)
+ return (error);
+
+ error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
+ new, newlen, retval, flags);
+ return (error);
+}
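+
+/*
+ * Usage sketch (illustrative only; the oid name and calling context are
+ * assumptions): reading an integer oid by name from kernel code.
+ *
+ *   char name[] = "kern.ipc.maxsockets";
+ *   int val, error;
+ *   size_t len;
+ *
+ *   len = sizeof(val);
+ *   error = kernel_sysctlbyname(curthread, name, &val, &len,
+ *       NULL, 0, NULL, 0);
+ *   if (error == 0 && len == sizeof(val))
+ *           printf("maxsockets: %d\n", val);
+ */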
+
+/*
+ * Transfer function to/from user space.
+ */
+static int
+sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i, len, origidx;
+ int error;
+
+ origidx = req->oldidx;
+ req->oldidx += l;
+ if (req->oldptr == NULL)
+ return (0);
+ /*
+ * If we have not wired the user supplied buffer and we are currently
+ * holding locks, drop a witness warning, as it's possible that
+ * write operations to the user page can sleep.
+ */
+ if (req->lock != REQ_WIRED)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "sysctl_old_user()");
+ i = l;
+ len = req->validlen;
+ if (len <= origidx)
+ i = 0;
+ else {
+ if (i > len - origidx)
+ i = len - origidx;
+ if (req->lock == REQ_WIRED) {
+ error = copyout_nofault(p, (char *)req->oldptr +
+ origidx, i);
+ } else
+ error = copyout(p, (char *)req->oldptr + origidx, i);
+ if (error != 0)
+ return (error);
+ }
+ if (i < l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
+{
+ int error;
+
+ if (!req->newptr)
+ return (0);
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "sysctl_new_user()");
+ error = copyin((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (error);
+}
+
+/*
+ * Wire the user space destination buffer. If set to a value greater than
+ * zero, the len parameter limits the maximum amount of wired memory.
+ */
+int
+sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
+{
+ int ret;
+ size_t wiredlen;
+
+ wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
+ ret = 0;
+ if (req->lock != REQ_WIRED && req->oldptr &&
+ req->oldfunc == sysctl_old_user) {
+ if (wiredlen != 0) {
+ ret = vslock(req->oldptr, wiredlen);
+ if (ret != 0) {
+ if (ret != ENOMEM)
+ return (ret);
+ wiredlen = 0;
+ }
+ }
+ req->lock = REQ_WIRED;
+ req->validlen = wiredlen;
+ }
+ return (0);
+}
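+
+/*
+ * Usage sketch (illustrative only; the list, lock and element type are
+ * assumptions): a handler that emits data while holding a non-sleepable
+ * lock wires the destination first, so the copyout done by SYSCTL_OUT()
+ * cannot fault and sleep with the lock held.
+ *
+ *   static int
+ *   sysctl_example_list(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           struct example_entry *p;
+ *           int error;
+ *
+ *           error = sysctl_wire_old_buffer(req, 0);
+ *           if (error != 0)
+ *                   return (error);
+ *           mtx_lock(&example_mtx);
+ *           LIST_FOREACH(p, &example_list, link) {
+ *                   error = SYSCTL_OUT(req, &p->stats, sizeof(p->stats));
+ *                   if (error != 0)
+ *                           break;
+ *           }
+ *           mtx_unlock(&example_mtx);
+ *           return (error);
+ *   }
+ */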
+
+int
+sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
+ int *nindx, struct sysctl_req *req)
+{
+ struct sysctl_oid_list *lsp;
+ struct sysctl_oid *oid;
+ int indx;
+
+ SYSCTL_ASSERT_XLOCKED();
+ lsp = &sysctl__children;
+ indx = 0;
+ while (indx < CTL_MAXNAME) {
+ SLIST_FOREACH(oid, lsp, oid_link) {
+ if (oid->oid_number == name[indx])
+ break;
+ }
+ if (oid == NULL)
+ return (ENOENT);
+
+ indx++;
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oid->oid_handler != NULL || indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
+ ("%s found DYING node %p", __func__, oid));
+ return (0);
+ }
+ lsp = SYSCTL_CHILDREN(oid);
+ } else if (indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
+ ("%s found DYING node %p", __func__, oid));
+ return (0);
+ } else {
+ return (ENOTDIR);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * Traverse our tree, and find the right node, execute whatever it points
+ * to, and return the resulting error code.
+ */
+
+static int
+sysctl_root(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error, indx, lvl;
+
+ SYSCTL_ASSERT_XLOCKED();
+
+ error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
+ if (error)
+ return (error);
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ /*
+ * You can't call a sysctl when it's a node, but has
+ * no handler. Inform the user that it's a node.
+ * The indx may or may not be the same as namelen.
+ */
+ if (oid->oid_handler == NULL)
+ return (EISDIR);
+ }
+
+ /* Is this sysctl writable? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
+ return (EPERM);
+
+ KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
+
+#ifdef CAPABILITY_MODE
+ /*
+ * If the process is in capability mode, then don't permit reading or
+ * writing unless specifically granted for the node.
+ */
+ if (IN_CAPABILITY_MODE(req->td)) {
+ if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD))
+ return (EPERM);
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))
+ return (EPERM);
+ }
+#endif
+
+ /* Is this sysctl sensitive to securelevels? */
+ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
+ lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
+ error = securelevel_gt(req->td->td_ucred, lvl);
+ if (error)
+ return (error);
+ }
+
+ /* Is this sysctl writable by only privileged users? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
+ int priv;
+
+ if (oid->oid_kind & CTLFLAG_PRISON)
+ priv = PRIV_SYSCTL_WRITEJAIL;
+#ifdef VIMAGE
+ else if ((oid->oid_kind & CTLFLAG_VNET) &&
+ prison_owns_vnet(req->td->td_ucred))
+ priv = PRIV_SYSCTL_WRITEJAIL;
+#endif
+ else
+ priv = PRIV_SYSCTL_WRITE;
+ error = priv_check(req->td, priv);
+ if (error)
+ return (error);
+ }
+
+ if (!oid->oid_handler)
+ return (EINVAL);
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ arg1 = (int *)arg1 + indx;
+ arg2 -= indx;
+ } else {
+ arg1 = oid->oid_arg1;
+ arg2 = oid->oid_arg2;
+ }
+#ifdef MAC
+ error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
+ req);
+ if (error != 0)
+ return (error);
+#endif
+ oid->oid_running++;
+ SYSCTL_XUNLOCK();
+
+ if (!(oid->oid_kind & CTLFLAG_MPSAFE))
+ mtx_lock(&Giant);
+ error = oid->oid_handler(oid, arg1, arg2, req);
+ if (!(oid->oid_kind & CTLFLAG_MPSAFE))
+ mtx_unlock(&Giant);
+
+ KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
+
+ SYSCTL_XLOCK();
+ oid->oid_running--;
+ if (oid->oid_running == 0 && (oid->oid_kind & CTLFLAG_DYING) != 0)
+ wakeup(&oid->oid_running);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sysctl_args {
+ int *name;
+ u_int namelen;
+ void *old;
+ size_t *oldlenp;
+ void *new;
+ size_t newlen;
+};
+#endif
+int
+sys___sysctl(struct thread *td, struct sysctl_args *uap)
+{
+ int error, i, name[CTL_MAXNAME];
+ size_t j;
+
+ if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
+ return (EINVAL);
+
+ error = copyin(uap->name, &name, uap->namelen * sizeof(int));
+ if (error)
+ return (error);
+
+ error = userland_sysctl(td, name, uap->namelen,
+ uap->old, uap->oldlenp, 0,
+ uap->new, uap->newlen, &j, 0);
+ if (error && error != ENOMEM)
+ return (error);
+ if (uap->oldlenp) {
+ i = copyout(&j, uap->oldlenp, sizeof(j));
+ if (i)
+ return (i);
+ }
+ return (error);
+}
+
+/*
+ * This is used from various compatibility syscalls too. That's why name
+ * must be in kernel space.
+ */
+int
+userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
+ int flags)
+{
+ int error = 0, memlocked;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+ req.flags = flags;
+
+ if (oldlenp) {
+ if (inkernel) {
+ req.oldlen = *oldlenp;
+ } else {
+ error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
+ if (error)
+ return (error);
+ }
+ }
+ req.validlen = req.oldlen;
+
+ if (old) {
+ if (!useracc(old, req.oldlen, VM_PROT_WRITE))
+ return (EFAULT);
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ if (!useracc(new, newlen, VM_PROT_READ))
+ return (EFAULT);
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_user;
+ req.newfunc = sysctl_new_user;
+ req.lock = REQ_UNWIRED;
+
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_SYSCTL))
+ ktrsysctl(name, namelen);
+#endif
+
+ if (req.oldlen > PAGE_SIZE) {
+ memlocked = 1;
+ sx_xlock(&sysctlmemlock);
+ } else
+ memlocked = 0;
+ CURVNET_SET(TD_TO_VNET(td));
+
+ for (;;) {
+ req.oldidx = 0;
+ req.newidx = 0;
+ SYSCTL_XLOCK();
+ error = sysctl_root(0, name, namelen, &req);
+ SYSCTL_XUNLOCK();
+ if (error != EAGAIN)
+ break;
+ kern_yield(PRI_USER);
+ }
+
+ CURVNET_RESTORE();
+
+ if (req.lock == REQ_WIRED && req.validlen > 0)
+ vsunlock(req.oldptr, req.validlen);
+ if (memlocked)
+ sx_xunlock(&sysctlmemlock);
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.validlen)
+ *retval = req.validlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+/*
+ * Drain into a sysctl struct. The user buffer should be wired if a page
+ * fault would cause issue.
+ */
+static int
+sbuf_sysctl_drain(void *arg, const char *data, int len)
+{
+ struct sysctl_req *req = arg;
+ int error;
+
+ error = SYSCTL_OUT(req, data, len);
+ KASSERT(error >= 0, ("Got unexpected negative value %d", error));
+ return (error == 0 ? len : -error);
+}
+
+struct sbuf *
+sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
+ struct sysctl_req *req)
+{
+
+ s = sbuf_new(s, buf, length, SBUF_FIXEDLEN);
+ sbuf_set_drain(s, sbuf_sysctl_drain, req);
+ return (s);
+}
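+
+/*
+ * Usage sketch (illustrative only; the buffer size and message are
+ * assumptions): the usual way a handler produces variable-length text
+ * through the drain above.
+ *
+ *   static int
+ *   sysctl_example_text(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           struct sbuf sb;
+ *           int error;
+ *
+ *           error = sysctl_wire_old_buffer(req, 0);
+ *           if (error != 0)
+ *                   return (error);
+ *           sbuf_new_for_sysctl(&sb, NULL, 128, req);
+ *           sbuf_printf(&sb, "example: %d\n", 42);
+ *           error = sbuf_finish(&sb);
+ *           sbuf_delete(&sb);
+ *           return (error);
+ *   }
+ */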
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
new file mode 100644
index 0000000..9fe7ebe
--- /dev/null
+++ b/sys/kern/kern_tc.c
@@ -0,0 +1,2030 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Julien Ridoux at the University
+ * of Melbourne under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ntp.h"
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#ifdef FFCLOCK
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/timeffc.h>
+#include <sys/timepps.h>
+#include <sys/timetc.h>
+#include <sys/timex.h>
+#include <sys/vdso.h>
+
+/*
+ * A large step happens on boot. This constant detects such steps.
+ * It is relatively small so that ntp_update_second gets called enough
+ * in the typical 'missed a couple of seconds' case, but doesn't loop
+ * forever when the time step is large.
+ */
+#define LARGE_STEP 200
+
+/*
+ * Implement a dummy timecounter which we can use until we get a real one
+ * in the air. This allows the console and other early stuff to use
+ * time services.
+ */
+
+static u_int
+dummy_get_timecount(struct timecounter *tc)
+{
+ static u_int now;
+
+ return (++now);
+}
+
+static struct timecounter dummy_timecounter = {
+ dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
+};
+
+struct timehands {
+ /* These fields must be initialized by the driver. */
+ struct timecounter *th_counter;
+ int64_t th_adjustment;
+ uint64_t th_scale;
+ u_int th_offset_count;
+ struct bintime th_offset;
+ struct timeval th_microtime;
+ struct timespec th_nanotime;
+ /* Fields not to be copied in tc_windup start with th_generation. */
+ volatile u_int th_generation;
+ struct timehands *th_next;
+};
+
+static struct timehands th0;
+static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
+static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
+static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
+static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7};
+static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6};
+static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5};
+static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4};
+static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3};
+static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2};
+static struct timehands th0 = {
+ &dummy_timecounter,
+ 0,
+ (uint64_t)-1 / 1000000,
+ 0,
+ {1, 0},
+ {0, 0},
+ {0, 0},
+ 1,
+ &th1
+};
+
+static struct timehands *volatile timehands = &th0;
+struct timecounter *timecounter = &dummy_timecounter;
+static struct timecounter *timecounters = &dummy_timecounter;
+
+int tc_min_ticktock_freq = 1;
+
+volatile time_t time_second = 1;
+volatile time_t time_uptime = 1;
+
+struct bintime boottimebin;
+struct timeval boottime;
+static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
+
+static int timestepwarnings;
+SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
+ &timestepwarnings, 0, "Log time steps");
+
+struct bintime bt_timethreshold;
+struct bintime bt_tickthreshold;
+sbintime_t sbt_timethreshold;
+sbintime_t sbt_tickthreshold;
+struct bintime tc_tick_bt;
+sbintime_t tc_tick_sbt;
+int tc_precexp;
+int tc_timepercentage = TC_DEFAULTPERC;
+TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage);
+static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_kern_timecounter_adjprecision, "I",
+ "Allowed time interval deviation in percents");
+
+static void tc_windup(void);
+static void cpu_tick_calibrate(int);
+
+void dtrace_getnanotime(struct timespec *tsp);
+
+static int
+sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
+{
+#ifndef __mips__
+#ifdef SCTL_MASK32
+ int tv[2];
+
+ if (req->flags & SCTL_MASK32) {
+ tv[0] = boottime.tv_sec;
+ tv[1] = boottime.tv_usec;
+ return SYSCTL_OUT(req, tv, sizeof(tv));
+ } else
+#endif
+#endif
+ return SYSCTL_OUT(req, &boottime, sizeof(boottime));
+}
+
+static int
+sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
+{
+ u_int ncount;
+ struct timecounter *tc = arg1;
+
+ ncount = tc->tc_get_timecount(tc);
+ return sysctl_handle_int(oidp, &ncount, 0, req);
+}
+
+static int
+sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t freq;
+ struct timecounter *tc = arg1;
+
+ freq = tc->tc_frequency;
+ return sysctl_handle_64(oidp, &freq, 0, req);
+}
+
+/*
+ * Return the difference between the timehands' counter value now and what
+ * was when we copied it to the timehands' offset_count.
+ */
+static __inline u_int
+tc_delta(struct timehands *th)
+{
+ struct timecounter *tc;
+
+ tc = th->th_counter;
+ return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
+ tc->tc_counter_mask);
+}
+
+/*
+ * Functions for reading the time. We have to loop until we are sure that
+ * the timehands that we operated on was not updated under our feet. See
+ * the comment in <sys/time.h> for a description of these 12 functions.
+ */
+
+#ifdef FFCLOCK
+void
+fbclock_binuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ bintime_addx(bt, th->th_scale * tc_delta(th));
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ fbclock_binuptime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+fbclock_microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ fbclock_binuptime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+fbclock_bintime(struct bintime *bt)
+{
+
+ fbclock_binuptime(bt);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+fbclock_nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ fbclock_bintime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+fbclock_microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ fbclock_bintime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+fbclock_getbinuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getnanouptime(struct timespec *tsp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timespec(&th->th_offset, tsp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getmicrouptime(struct timeval *tvp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timeval(&th->th_offset, tvp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getbintime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+fbclock_getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getmicrotime(struct timeval *tvp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tvp = th->th_microtime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+#else /* !FFCLOCK */
+void
+binuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ bintime_addx(bt, th->th_scale * tc_delta(th));
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ binuptime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ binuptime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+bintime(struct bintime *bt)
+{
+
+ binuptime(bt);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ bintime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ bintime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+getbinuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timespec(&th->th_offset, tsp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timeval(&th->th_offset, tvp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getbintime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrotime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tvp = th->th_microtime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+#endif /* FFCLOCK */
+
+#ifdef FFCLOCK
+/*
+ * Support for feed-forward synchronization algorithms. This is heavily inspired
+ * by the timehands mechanism but kept independent from it. *_windup() functions
+ * are loosely coupled (ffclock_windup() is driven from tc_windup()) so that
+ * the timecounter hardware is not accessed more than necessary.
+ */
+
+/* Feed-forward clock estimates kept updated by the synchronization daemon. */
+struct ffclock_estimate ffclock_estimate;
+struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */
+uint32_t ffclock_status; /* Feed-forward clock status. */
+int8_t ffclock_updated; /* New estimates are available. */
+struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */
+
+struct fftimehands {
+ struct ffclock_estimate cest;
+ struct bintime tick_time;
+ struct bintime tick_time_lerp;
+ ffcounter tick_ffcount;
+ uint64_t period_lerp;
+ volatile uint8_t gen;
+ struct fftimehands *next;
+};
+
+#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x))
+
+static struct fftimehands ffth[10];
+static struct fftimehands *volatile fftimehands = ffth;
+
+static void
+ffclock_init(void)
+{
+ struct fftimehands *cur;
+ struct fftimehands *last;
+
+ memset(ffth, 0, sizeof(ffth));
+
+ last = ffth + NUM_ELEMENTS(ffth) - 1;
+ for (cur = ffth; cur < last; cur++)
+ cur->next = cur + 1;
+ last->next = ffth;
+
+ ffclock_updated = 0;
+ ffclock_status = FFCLOCK_STA_UNSYNC;
+ mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
+}
+
+/*
+ * Reset the feed-forward clock estimates. Called from inittodr() to get things
+ * kick-started and uses the timecounter nominal frequency as a first period
+ * estimate. Note: this function may be called several times just after boot.
+ * Note: this is the only function that sets the value of boot time for the
+ * monotonic (i.e. uptime) version of the feed-forward clock.
+ */
+void
+ffclock_reset_clock(struct timespec *ts)
+{
+ struct timecounter *tc;
+ struct ffclock_estimate cest;
+
+ tc = timehands->th_counter;
+ memset(&cest, 0, sizeof(struct ffclock_estimate));
+
+ timespec2bintime(ts, &ffclock_boottime);
+ timespec2bintime(ts, &(cest.update_time));
+ ffclock_read_counter(&cest.update_ffcount);
+ cest.leapsec_next = 0;
+ cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
+ cest.errb_abs = 0;
+ cest.errb_rate = 0;
+ cest.status = FFCLOCK_STA_UNSYNC;
+ cest.leapsec_total = 0;
+ cest.leapsec = 0;
+
+ mtx_lock(&ffclock_mtx);
+ bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
+ ffclock_updated = INT8_MAX;
+ mtx_unlock(&ffclock_mtx);
+
+ printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
+ (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
+ (unsigned long)ts->tv_nsec);
+}
+
+/*
+ * Sub-routine to convert a time interval measured in RAW counter units to time
+ * in seconds stored in bintime format.
+ * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be
+ * larger than the max value of u_int (on 32 bit architecture). Loop to consume
+ * extra cycles.
+ */
+static void
+ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
+{
+ struct bintime bt2;
+ ffcounter delta, delta_max;
+
+ delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
+ bintime_clear(bt);
+ do {
+ if (ffdelta > delta_max)
+ delta = delta_max;
+ else
+ delta = ffdelta;
+ bt2.sec = 0;
+ bt2.frac = period;
+ bintime_mul(&bt2, (unsigned int)delta);
+ bintime_add(bt, &bt2);
+ ffdelta -= delta;
+ } while (ffdelta > 0);
+}
+
+/*
+ * Update the fftimehands.
+ * Push the tick ffcount and time(s) forward based on current clock estimate.
+ * The conversion from ffcounter to bintime relies on the difference clock
+ * principle, whose accuracy relies on computing small time intervals. If a new
+ * clock estimate has been passed by the synchronisation daemon, make it
+ * current, and compute the linear interpolation for monotonic time if needed.
+ */
+static void
+ffclock_windup(unsigned int delta)
+{
+ struct ffclock_estimate *cest;
+ struct fftimehands *ffth;
+ struct bintime bt, gap_lerp;
+ ffcounter ffdelta;
+ uint64_t frac;
+ unsigned int polling;
+ uint8_t forward_jump, ogen;
+
+ /*
+ * Pick the next timehand, copy current ffclock estimates and move tick
+ * times and counter forward.
+ */
+ forward_jump = 0;
+ ffth = fftimehands->next;
+ ogen = ffth->gen;
+ ffth->gen = 0;
+ cest = &ffth->cest;
+ bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
+ ffdelta = (ffcounter)delta;
+ ffth->period_lerp = fftimehands->period_lerp;
+
+ ffth->tick_time = fftimehands->tick_time;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ bintime_add(&ffth->tick_time, &bt);
+
+ ffth->tick_time_lerp = fftimehands->tick_time_lerp;
+ ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
+ bintime_add(&ffth->tick_time_lerp, &bt);
+
+ ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;
+
+ /*
+ * Assess the status of the clock, if the last update is too old, it is
+ * likely the synchronisation daemon is dead and the clock is free
+ * running.
+ */
+ if (ffclock_updated == 0) {
+ ffdelta = ffth->tick_ffcount - cest->update_ffcount;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
+ ffclock_status |= FFCLOCK_STA_UNSYNC;
+ }
+
+ /*
+ * If available, grab updated clock estimates and make them current.
+ * Recompute time at this tick using the updated estimates. The clock
+ * estimates passed in by the feed-forward synchronisation daemon may result
+ * in a time conversion that is not monotonically increasing (just after
+ * the update). time_lerp is a particular linear interpolation over the
+ * synchronisation algo polling period that ensures monotonicity for the
+ * clock ids requesting it.
+ */
+ if (ffclock_updated > 0) {
+ bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
+ ffdelta = ffth->tick_ffcount - cest->update_ffcount;
+ ffth->tick_time = cest->update_time;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ bintime_add(&ffth->tick_time, &bt);
+
+ /* ffclock_reset sets ffclock_updated to INT8_MAX */
+ if (ffclock_updated == INT8_MAX)
+ ffth->tick_time_lerp = ffth->tick_time;
+
+ if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
+ forward_jump = 1;
+ else
+ forward_jump = 0;
+
+ bintime_clear(&gap_lerp);
+ if (forward_jump) {
+ gap_lerp = ffth->tick_time;
+ bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
+ } else {
+ gap_lerp = ffth->tick_time_lerp;
+ bintime_sub(&gap_lerp, &ffth->tick_time);
+ }
+
+ /*
+ * The reset from the RTC clock may be far from accurate, and
+ * reducing the gap between real time and interpolated time
+ * could take a very long time if the interpolated clock insists
+ * on strict monotonicity. The clock is reset under very strict
+ * conditions (kernel time is known to be wrong and the
+ * synchronization daemon has been restarted recently).
+ * ffclock_boottime absorbs the jump to ensure boot time is
+ * correct and uptime functions stay consistent.
+ */
+ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
+ ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
+ ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
+ if (forward_jump)
+ bintime_add(&ffclock_boottime, &gap_lerp);
+ else
+ bintime_sub(&ffclock_boottime, &gap_lerp);
+ ffth->tick_time_lerp = ffth->tick_time;
+ bintime_clear(&gap_lerp);
+ }
+
+ ffclock_status = cest->status;
+ ffth->period_lerp = cest->period;
+
+ /*
+ * Compute corrected period used for the linear interpolation of
+ * time. The rate of linear interpolation is capped to 5000PPM
+ * (5ms/s).
+ */
+ if (bintime_isset(&gap_lerp)) {
+ ffdelta = cest->update_ffcount;
+ ffdelta -= fftimehands->cest.update_ffcount;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ polling = bt.sec;
+ bt.sec = 0;
+ bt.frac = 5000000 * (uint64_t)18446744073LL;
+ bintime_mul(&bt, polling);
+ if (bintime_cmp(&gap_lerp, &bt, >))
+ gap_lerp = bt;
+
+ /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */
+ frac = 0;
+ if (gap_lerp.sec > 0) {
+ frac -= 1;
+ frac /= ffdelta / gap_lerp.sec;
+ }
+ frac += gap_lerp.frac / ffdelta;
+
+ if (forward_jump)
+ ffth->period_lerp += frac;
+ else
+ ffth->period_lerp -= frac;
+ }
+
+ ffclock_updated = 0;
+ }
+ if (++ogen == 0)
+ ogen = 1;
+ ffth->gen = ogen;
+ fftimehands = ffth;
+}
+
+/*
+ * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
+ * the old and new hardware counter cannot be read simultaneously. tc_windup()
+ * does read the two counters 'back to back', but a few cycles are effectively
+ * lost, and not accumulated in tick_ffcount. This is a fairly radical
+ * operation for a feed-forward synchronization daemon, and it is its job to not
+ * pushing irrelevant data to the kernel. Because there is no locking here,
+ * simply force to ignore pending or next update to give daemon a chance to
+ * realize the counter has changed.
+ */
+static void
+ffclock_change_tc(struct timehands *th)
+{
+ struct fftimehands *ffth;
+ struct ffclock_estimate *cest;
+ struct timecounter *tc;
+ uint8_t ogen;
+
+ tc = th->th_counter;
+ ffth = fftimehands->next;
+ ogen = ffth->gen;
+ ffth->gen = 0;
+
+ cest = &ffth->cest;
+ bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
+ cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1;
+ cest->errb_abs = 0;
+ cest->errb_rate = 0;
+ cest->status |= FFCLOCK_STA_UNSYNC;
+
+ ffth->tick_ffcount = fftimehands->tick_ffcount;
+ ffth->tick_time_lerp = fftimehands->tick_time_lerp;
+ ffth->tick_time = fftimehands->tick_time;
+ ffth->period_lerp = cest->period;
+
+ /* Do not lock but ignore next update from synchronization daemon. */
+ ffclock_updated--;
+
+ if (++ogen == 0)
+ ogen = 1;
+ ffth->gen = ogen;
+ fftimehands = ffth;
+}
+
+/*
+ * Retrieve feed-forward counter and time of last kernel tick.
+ */
+void
+ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
+{
+ struct fftimehands *ffth;
+ uint8_t gen;
+
+ /*
+ * No locking but check generation has not changed. Also need to make
+ * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
+ */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
+ *bt = ffth->tick_time_lerp;
+ else
+ *bt = ffth->tick_time;
+ *ffcount = ffth->tick_ffcount;
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Absolute clock conversion. Low level function to convert ffcounter to
+ * bintime. The ffcounter is converted using the current ffclock period estimate
+ * or the "interpolated period" to ensure monotonicity.
+ * NOTE: this conversion may have been deferred, and the clock updated since the
+ * hardware counter has been read.
+ */
+void
+ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
+{
+ struct fftimehands *ffth;
+ struct bintime bt2;
+ ffcounter ffdelta;
+ uint8_t gen;
+
+ /*
+ * No locking but check generation has not changed. Also need to make
+ * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
+ */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ if (ffcount > ffth->tick_ffcount)
+ ffdelta = ffcount - ffth->tick_ffcount;
+ else
+ ffdelta = ffth->tick_ffcount - ffcount;
+
+ if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
+ *bt = ffth->tick_time_lerp;
+ ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
+ } else {
+ *bt = ffth->tick_time;
+ ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
+ }
+
+ if (ffcount > ffth->tick_ffcount)
+ bintime_add(bt, &bt2);
+ else
+ bintime_sub(bt, &bt2);
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Difference clock conversion.
+ * Low level function to convert a time interval measured in RAW counter units
+ * into bintime. The difference clock allows measuring small intervals much more
+ * reliably than the absolute clock.
+ */
+void
+ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
+{
+ struct fftimehands *ffth;
+ uint8_t gen;
+
+ /* No locking but check generation has not changed. */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Access to current ffcounter value.
+ */
+void
+ffclock_read_counter(ffcounter *ffcount)
+{
+ struct timehands *th;
+ struct fftimehands *ffth;
+ unsigned int gen, delta;
+
+ /*
+ * ffclock_windup() called from tc_windup(), safe to rely on
+ * th->th_generation only, for correct delta and ffcounter.
+ */
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ ffth = fftimehands;
+ delta = tc_delta(th);
+ *ffcount = ffth->tick_ffcount;
+ } while (gen == 0 || gen != th->th_generation);
+
+ *ffcount += delta;
+}
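+
+/*
+ * Usage sketch (illustrative only; variable names and the event placeholder
+ * are assumptions): timestamp an event with the raw counter and convert
+ * later, or measure a short interval with the difference clock.
+ *
+ *   ffcounter before, after;
+ *   struct bintime interval, when;
+ *
+ *   ffclock_read_counter(&before);
+ *   ... event ...
+ *   ffclock_read_counter(&after);
+ *   ffclock_convert_diff(after - before, &interval);
+ *   ffclock_convert_abs(before, &when, FFCLOCK_LERP);
+ */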
+
+void
+binuptime(struct bintime *bt)
+{
+
+ binuptime_fromclock(bt, sysclock_active);
+}
+
+void
+nanouptime(struct timespec *tsp)
+{
+
+ nanouptime_fromclock(tsp, sysclock_active);
+}
+
+void
+microuptime(struct timeval *tvp)
+{
+
+ microuptime_fromclock(tvp, sysclock_active);
+}
+
+void
+bintime(struct bintime *bt)
+{
+
+ bintime_fromclock(bt, sysclock_active);
+}
+
+void
+nanotime(struct timespec *tsp)
+{
+
+ nanotime_fromclock(tsp, sysclock_active);
+}
+
+void
+microtime(struct timeval *tvp)
+{
+
+ microtime_fromclock(tvp, sysclock_active);
+}
+
+void
+getbinuptime(struct bintime *bt)
+{
+
+ getbinuptime_fromclock(bt, sysclock_active);
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+
+ getnanouptime_fromclock(tsp, sysclock_active);
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+
+ getmicrouptime_fromclock(tvp, sysclock_active);
+}
+
+void
+getbintime(struct bintime *bt)
+{
+
+ getbintime_fromclock(bt, sysclock_active);
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+
+ getnanotime_fromclock(tsp, sysclock_active);
+}
+
+void
+getmicrotime(struct timeval *tvp)
+{
+
+ getmicrotime_fromclock(tvp, sysclock_active);
+}
+
+#endif /* FFCLOCK */
+
+/*
+ * This is a clone of getnanotime and used for walltimestamps.
+ * The dtrace_ prefix prevents fbt from creating probes for
+ * it so walltimestamp can be safely used in all fbt probes.
+ */
+void
+dtrace_getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+/*
+ * System clock currently providing time to the system. Modifiable via sysctl
+ * when the FFCLOCK option is defined.
+ */
+int sysclock_active = SYSCLOCK_FBCK;
+
+/* Internal NTP status and error estimates. */
+extern int time_status;
+extern long time_esterror;
+
+/*
+ * Take a snapshot of sysclock data which can be used to compare system clocks
+ * and generate timestamps after the fact.
+ */
+void
+sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
+{
+ struct fbclock_info *fbi;
+ struct timehands *th;
+ struct bintime bt;
+ unsigned int delta, gen;
+#ifdef FFCLOCK
+ ffcounter ffcount;
+ struct fftimehands *ffth;
+ struct ffclock_info *ffi;
+ struct ffclock_estimate cest;
+
+ ffi = &clock_snap->ff_info;
+#endif
+
+ fbi = &clock_snap->fb_info;
+ delta = 0;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ fbi->th_scale = th->th_scale;
+ fbi->tick_time = th->th_offset;
+#ifdef FFCLOCK
+ ffth = fftimehands;
+ ffi->tick_time = ffth->tick_time_lerp;
+ ffi->tick_time_lerp = ffth->tick_time_lerp;
+ ffi->period = ffth->cest.period;
+ ffi->period_lerp = ffth->period_lerp;
+ clock_snap->ffcount = ffth->tick_ffcount;
+ cest = ffth->cest;
+#endif
+ if (!fast)
+ delta = tc_delta(th);
+ } while (gen == 0 || gen != th->th_generation);
+
+ clock_snap->delta = delta;
+ clock_snap->sysclock_active = sysclock_active;
+
+ /* Record feedback clock status and error. */
+ clock_snap->fb_info.status = time_status;
+ /* XXX: Very crude estimate of feedback clock error. */
+ bt.sec = time_esterror / 1000000;
+ bt.frac = ((time_esterror - bt.sec) * 1000000) *
+ (uint64_t)18446744073709ULL;
+ clock_snap->fb_info.error = bt;
+
+#ifdef FFCLOCK
+ if (!fast)
+ clock_snap->ffcount += delta;
+
+ /* Record feed-forward clock leap second adjustment. */
+ ffi->leapsec_adjustment = cest.leapsec_total;
+ if (clock_snap->ffcount > cest.leapsec_next)
+ ffi->leapsec_adjustment -= cest.leapsec;
+
+ /* Record feed-forward clock status and error. */
+ clock_snap->ff_info.status = cest.status;
+ ffcount = clock_snap->ffcount - cest.update_ffcount;
+ ffclock_convert_delta(ffcount, cest.period, &bt);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */
+ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
+ /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
+ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
+ clock_snap->ff_info.error = bt;
+#endif
+}
+
+/*
+ * Convert a sysclock snapshot into a struct bintime based on the specified
+ * clock source and flags.
+ */
+int
+sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
+ int whichclock, uint32_t flags)
+{
+#ifdef FFCLOCK
+ struct bintime bt2;
+ uint64_t period;
+#endif
+
+ switch (whichclock) {
+ case SYSCLOCK_FBCK:
+ *bt = cs->fb_info.tick_time;
+
+ /* If snapshot was created with !fast, delta will be >0. */
+ if (cs->delta > 0)
+ bintime_addx(bt, cs->fb_info.th_scale * cs->delta);
+
+ if ((flags & FBCLOCK_UPTIME) == 0)
+ bintime_add(bt, &boottimebin);
+ break;
+#ifdef FFCLOCK
+ case SYSCLOCK_FFWD:
+ if (flags & FFCLOCK_LERP) {
+ *bt = cs->ff_info.tick_time_lerp;
+ period = cs->ff_info.period_lerp;
+ } else {
+ *bt = cs->ff_info.tick_time;
+ period = cs->ff_info.period;
+ }
+
+ /* If snapshot was created with !fast, delta will be >0. */
+ if (cs->delta > 0) {
+ ffclock_convert_delta(cs->delta, period, &bt2);
+ bintime_add(bt, &bt2);
+ }
+
+ /* Leap second adjustment. */
+ if (flags & FFCLOCK_LEAPSEC)
+ bt->sec -= cs->ff_info.leapsec_adjustment;
+
+ /* Boot time adjustment, for uptime/monotonic clocks. */
+ if (flags & FFCLOCK_UPTIME)
+ bintime_sub(bt, &ffclock_boottime);
+ break;
+#endif
+ default:
+ return (EINVAL);
+ break;
+ }
+
+ return (0);
+}
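+
+/*
+ * Usage sketch (illustrative only; the calling context is an assumption):
+ * capture a snapshot once and derive a timestamp from it later, here using
+ * whichever system clock was active when the snapshot was taken.
+ *
+ *   struct sysclock_snap cs;
+ *   struct bintime bt;
+ *
+ *   sysclock_getsnapshot(&cs, 0);
+ *   ...
+ *   sysclock_snap2bintime(&cs, &bt, cs.sysclock_active, 0);
+ */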
+
+/*
+ * Initialize a new timecounter and possibly use it.
+ */
+void
+tc_init(struct timecounter *tc)
+{
+ u_int u;
+ struct sysctl_oid *tc_root;
+
+ u = tc->tc_frequency / tc->tc_counter_mask;
+ /* XXX: We need some margin here, 10% is a guess */
+ u *= 11;
+ u /= 10;
+ if (u > hz && tc->tc_quality >= 0) {
+ tc->tc_quality = -2000;
+ if (bootverbose) {
+ printf("Timecounter \"%s\" frequency %ju Hz",
+ tc->tc_name, (uintmax_t)tc->tc_frequency);
+ printf(" -- Insufficient hz, needs at least %u\n", u);
+ }
+ } else if (tc->tc_quality >= 0 || bootverbose) {
+ printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
+ tc->tc_name, (uintmax_t)tc->tc_frequency,
+ tc->tc_quality);
+ }
+
+ tc->tc_next = timecounters;
+ timecounters = tc;
+ /*
+ * Set up sysctl tree for this counter.
+ */
+ tc_root = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
+ CTLFLAG_RW, 0, "timecounter description");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
+ "mask for implemented bits");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_get, "IU", "current timecounter value");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
+ "goodness of time counter");
+ /*
+ * Never automatically use a timecounter with negative quality.
+ * Even though we run on the dummy counter, switching here may be
+ * worse since this timecounter may not be monotonic.
+ */
+ if (tc->tc_quality < 0)
+ return;
+ if (tc->tc_quality < timecounter->tc_quality)
+ return;
+ if (tc->tc_quality == timecounter->tc_quality &&
+ tc->tc_frequency < timecounter->tc_frequency)
+ return;
+ (void)tc->tc_get_timecount(tc);
+ (void)tc->tc_get_timecount(tc);
+ timecounter = tc;
+}
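+
+/*
+ * Usage sketch (illustrative only; the read routine, name, quality and
+ * frequency are assumptions): the usual registration sequence for a
+ * hardware timecounter driver.
+ *
+ *   static u_int
+ *   example_get_timecount(struct timecounter *tc)
+ *   {
+ *           return (EXAMPLE_READ_COUNT());    (placeholder hardware read)
+ *   }
+ *
+ *   static struct timecounter example_tc = {
+ *           .tc_get_timecount = example_get_timecount,
+ *           .tc_counter_mask = ~0u,
+ *           .tc_name = "example",
+ *           .tc_quality = 800,
+ *   };
+ *
+ *   (at attach time)
+ *   example_tc.tc_frequency = measured_frequency;
+ *   tc_init(&example_tc);
+ */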
+
+/* Report the frequency of the current timecounter. */
+uint64_t
+tc_getfrequency(void)
+{
+
+ return (timehands->th_counter->tc_frequency);
+}
+
+/*
+ * Step our concept of UTC. This is done by modifying our estimate of
+ * when we booted.
+ * XXX: not locked.
+ */
+void
+tc_setclock(struct timespec *ts)
+{
+ struct timespec tbef, taft;
+ struct bintime bt, bt2;
+
+ cpu_tick_calibrate(1);
+ nanotime(&tbef);
+ timespec2bintime(ts, &bt);
+ binuptime(&bt2);
+ bintime_sub(&bt, &bt2);
+ bintime_add(&bt2, &boottimebin);
+ boottimebin = bt;
+ bintime2timeval(&bt, &boottime);
+
+ /* XXX fiddle all the little crinkly bits around the fiords... */
+ tc_windup();
+ nanotime(&taft);
+ if (timestepwarnings) {
+ log(LOG_INFO,
+ "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
+ (intmax_t)tbef.tv_sec, tbef.tv_nsec,
+ (intmax_t)taft.tv_sec, taft.tv_nsec,
+ (intmax_t)ts->tv_sec, ts->tv_nsec);
+ }
+ cpu_tick_calibrate(1);
+}
+
+/*
+ * Initialize the next struct timehands in the ring and make
+ * it the active timehands. Along the way we might switch to a different
+ * timecounter and/or do seconds processing in NTP. Slightly magic.
+ */
+static void
+tc_windup(void)
+{
+ struct bintime bt;
+ struct timehands *th, *tho;
+ uint64_t scale;
+ u_int delta, ncount, ogen;
+ int i;
+ time_t t;
+
+ /*
+ * Make the next timehands a copy of the current one, but do not
+ * overwrite the generation or next pointer. While we update
+ * the contents, the generation must be zero.
+ */
+ tho = timehands;
+ th = tho->th_next;
+ ogen = th->th_generation;
+ th->th_generation = 0;
+ bcopy(tho, th, offsetof(struct timehands, th_generation));
+
+ /*
+ * Capture a timecounter delta on the current timecounter and if
+ * changing timecounters, a counter value from the new timecounter.
+ * Update the offset fields accordingly.
+ */
+ delta = tc_delta(th);
+ if (th->th_counter != timecounter)
+ ncount = timecounter->tc_get_timecount(timecounter);
+ else
+ ncount = 0;
+#ifdef FFCLOCK
+ ffclock_windup(delta);
+#endif
+ th->th_offset_count += delta;
+ th->th_offset_count &= th->th_counter->tc_counter_mask;
+ while (delta > th->th_counter->tc_frequency) {
+ /* Eat complete unadjusted seconds. */
+ delta -= th->th_counter->tc_frequency;
+ th->th_offset.sec++;
+ }
+ if ((delta > th->th_counter->tc_frequency / 2) &&
+ (th->th_scale * delta < ((uint64_t)1 << 63))) {
+ /* The product th_scale * delta just barely overflows. */
+ th->th_offset.sec++;
+ }
+ bintime_addx(&th->th_offset, th->th_scale * delta);
+
+ /*
+ * Hardware latching timecounters may not generate interrupts on
+ * PPS events, so instead we poll them. There is a finite risk that
+ * the hardware might capture a count which is later than the one we
+ * got above, and therefore possibly in the next NTP second which might
+ * have a different rate than the current NTP second. It doesn't
+ * matter in practice.
+ */
+ if (tho->th_counter->tc_poll_pps)
+ tho->th_counter->tc_poll_pps(tho->th_counter);
+
+ /*
+ * Deal with NTP second processing. The for loop normally
+ * iterates at most once, but in extreme situations it might
+ * keep NTP sane if timeouts are not run for several seconds.
+ * At boot, the time step can be large when the TOD hardware
+ * has been read, so on really large steps, we call
+ * ntp_update_second only twice. We need to call it twice in
+ * case we missed a leap second.
+ */
+ bt = th->th_offset;
+ bintime_add(&bt, &boottimebin);
+ i = bt.sec - tho->th_microtime.tv_sec;
+ if (i > LARGE_STEP)
+ i = 2;
+ for (; i > 0; i--) {
+ t = bt.sec;
+ ntp_update_second(&th->th_adjustment, &bt.sec);
+ if (bt.sec != t)
+ boottimebin.sec += bt.sec - t;
+ }
+ /* Update the UTC timestamps used by the get*() functions. */
+ /* XXX shouldn't do this here. Should force non-`get' versions. */
+ bintime2timeval(&bt, &th->th_microtime);
+ bintime2timespec(&bt, &th->th_nanotime);
+
+ /* Now is a good time to change timecounters. */
+ if (th->th_counter != timecounter) {
+#ifndef __arm__
+ if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep++;
+ if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep--;
+#endif
+ th->th_counter = timecounter;
+ th->th_offset_count = ncount;
+ tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
+ (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
+#ifdef FFCLOCK
+ ffclock_change_tc(th);
+#endif
+ }
+
+ /*-
+ * Recalculate the scaling factor. We want the number of 1/2^64
+ * fractions of a second per period of the hardware counter, taking
+ * into account the th_adjustment factor which the NTP PLL/adjtime(2)
+ * processing provides us with.
+ *
+ * The th_adjustment is nanoseconds per second with 32 bit binary
+ * fraction and we want 64 bit binary fraction of second:
+ *
+ * x = a * 2^32 / 10^9 = a * 4.294967296
+ *
+ * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
+ * we can only multiply by about 850 without overflowing, that
+ * leaves no suitably precise fractions for multiply before divide.
+ *
+ * Divide before multiply with a fraction of 2199/512 results in a
+ * systematic undercompensation of 10PPM of th_adjustment. On a
+ * 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
+ *
+ * We happily sacrifice the lowest of the 64 bits of our result
+ * to the goddess of code clarity.
+ *
+ */
+ scale = (uint64_t)1 << 63;
+ scale += (th->th_adjustment / 1024) * 2199;
+ scale /= th->th_counter->tc_frequency;
+ th->th_scale = scale * 2;
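+	/*
+	 * Illustrative check of the approximation above (not a normative
+	 * derivation): the exact conversion factor is 2^32 / 10^9 =
+	 * 4.294967296, while (2199 / 1024) * 2 = 4.294921875, i.e. about
+	 * 10.6 PPM low, which is the systematic undercompensation
+	 * mentioned in the comment above.
+	 */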
+
+ /*
+ * Now that the struct timehands is again consistent, set the new
+ * generation number, making sure to not make it zero.
+ */
+ if (++ogen == 0)
+ ogen = 1;
+ th->th_generation = ogen;
+
+ /* Go live with the new struct timehands. */
+#ifdef FFCLOCK
+ switch (sysclock_active) {
+ case SYSCLOCK_FBCK:
+#endif
+ time_second = th->th_microtime.tv_sec;
+ time_uptime = th->th_offset.sec;
+#ifdef FFCLOCK
+ break;
+ case SYSCLOCK_FFWD:
+ time_second = fftimehands->tick_time_lerp.sec;
+ time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec;
+ break;
+ }
+#endif
+
+ timehands = th;
+ timekeep_push_vdso();
+}
+
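+/*
+ * For reference, consumers of the timehands ring (the bintime()/
+ * binuptime() family earlier in this file) are expected to follow a
+ * lock-free generation-check pattern along these lines (sketch only,
+ * not the verbatim implementation):
+ *
+ *	do {
+ *		th = timehands;
+ *		gen = th->th_generation;
+ *		*bt = th->th_offset;
+ *		bintime_addx(bt, th->th_scale * tc_delta(th));
+ *	} while (gen == 0 || gen != th->th_generation);
+ *
+ * A zero or changed generation means tc_windup() was rewriting that
+ * slot, so the read is retried.
+ */
+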
+/* Report or change the active timecounter hardware. */
+static int
+sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
+{
+ char newname[32];
+ struct timecounter *newtc, *tc;
+ int error;
+
+ tc = timecounter;
+ strlcpy(newname, tc->tc_name, sizeof(newname));
+
+ error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
+ if (error != 0 || req->newptr == NULL ||
+ strcmp(newname, tc->tc_name) == 0)
+ return (error);
+ for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
+ if (strcmp(newname, newtc->tc_name) != 0)
+ continue;
+
+ /* Warm up new timecounter. */
+ (void)newtc->tc_get_timecount(newtc);
+ (void)newtc->tc_get_timecount(newtc);
+
+ timecounter = newtc;
+ timekeep_push_vdso();
+ return (0);
+ }
+ return (EINVAL);
+}
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_kern_timecounter_hardware, "A",
+ "Timecounter hardware selected");
+
+
+/* Report the available timecounter hardware. */
+static int
+sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
+{
+ char buf[32], *spc;
+ struct timecounter *tc;
+ int error;
+
+ spc = "";
+ error = 0;
+ for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
+ sprintf(buf, "%s%s(%d)",
+ spc, tc->tc_name, tc->tc_quality);
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ spc = " ";
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");
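+
+/*
+ * Example use from userland (illustrative; the counter names and
+ * quality values shown are hypothetical and vary by machine):
+ *
+ *	# sysctl kern.timecounter.choice
+ *	kern.timecounter.choice: TSC(800) HPET(950) ACPI-fast(900) i8254(0)
+ *	# sysctl kern.timecounter.hardware=HPET
+ *	kern.timecounter.hardware: TSC -> HPET
+ */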
+
+/*
+ * RFC 2783 PPS-API implementation.
+ */
+
+static int
+pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
+{
+ int err, timo;
+ pps_seq_t aseq, cseq;
+ struct timeval tv;
+
+ if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+
+ /*
+ * If no timeout is requested, immediately return whatever values were
+ * most recently captured. If timeout seconds is -1, that's a request
+ * to block without a timeout. WITNESS won't let us sleep forever
+ * without a lock (we really don't need a lock), so just repeatedly
+ * sleep a long time.
+ */
+ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
+ if (fapi->timeout.tv_sec == -1)
+ timo = 0x7fffffff;
+ else {
+ tv.tv_sec = fapi->timeout.tv_sec;
+ tv.tv_usec = fapi->timeout.tv_nsec / 1000;
+ timo = tvtohz(&tv);
+ }
+ aseq = pps->ppsinfo.assert_sequence;
+ cseq = pps->ppsinfo.clear_sequence;
+ while (aseq == pps->ppsinfo.assert_sequence &&
+ cseq == pps->ppsinfo.clear_sequence) {
+ err = tsleep(pps, PCATCH, "ppsfch", timo);
+ if (err == EWOULDBLOCK && fapi->timeout.tv_sec == -1) {
+ continue;
+ } else if (err != 0) {
+ return (err);
+ }
+ }
+ }
+
+ pps->ppsinfo.current_mode = pps->ppsparam.mode;
+ fapi->pps_info_buf = pps->ppsinfo;
+
+ return (0);
+}
+
+int
+pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
+{
+ pps_params_t *app;
+ struct pps_fetch_args *fapi;
+#ifdef FFCLOCK
+ struct pps_fetch_ffc_args *fapi_ffc;
+#endif
+#ifdef PPS_SYNC
+ struct pps_kcbind_args *kapi;
+#endif
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
+ switch (cmd) {
+ case PPS_IOC_CREATE:
+ return (0);
+ case PPS_IOC_DESTROY:
+ return (0);
+ case PPS_IOC_SETPARAMS:
+ app = (pps_params_t *)data;
+ if (app->mode & ~pps->ppscap)
+ return (EINVAL);
+#ifdef FFCLOCK
+ /* Ensure only a single clock is selected for ffc timestamp. */
+ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
+ return (EINVAL);
+#endif
+ pps->ppsparam = *app;
+ return (0);
+ case PPS_IOC_GETPARAMS:
+ app = (pps_params_t *)data;
+ *app = pps->ppsparam;
+ app->api_version = PPS_API_VERS_1;
+ return (0);
+ case PPS_IOC_GETCAP:
+ *(int*)data = pps->ppscap;
+ return (0);
+ case PPS_IOC_FETCH:
+ fapi = (struct pps_fetch_args *)data;
+ return (pps_fetch(fapi, pps));
+#ifdef FFCLOCK
+ case PPS_IOC_FETCH_FFCOUNTER:
+ fapi_ffc = (struct pps_fetch_ffc_args *)data;
+ if (fapi_ffc->tsformat && fapi_ffc->tsformat !=
+ PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec)
+ return (EOPNOTSUPP);
+ pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
+ fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
+ /* Overwrite timestamps if feedback clock selected. */
+ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) {
+ case PPS_TSCLK_FBCK:
+ fapi_ffc->pps_info_buf_ffc.assert_timestamp =
+ pps->ppsinfo.assert_timestamp;
+ fapi_ffc->pps_info_buf_ffc.clear_timestamp =
+ pps->ppsinfo.clear_timestamp;
+ break;
+ case PPS_TSCLK_FFWD:
+ break;
+ default:
+ break;
+ }
+ return (0);
+#endif /* FFCLOCK */
+ case PPS_IOC_KCBIND:
+#ifdef PPS_SYNC
+ kapi = (struct pps_kcbind_args *)data;
+ /* XXX Only root should be able to do this */
+ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (kapi->kernel_consumer != PPS_KC_HARDPPS)
+ return (EINVAL);
+ if (kapi->edge & ~pps->ppscap)
+ return (EINVAL);
+ pps->kcmode = kapi->edge;
+ return (0);
+#else
+ return (EOPNOTSUPP);
+#endif
+ default:
+ return (ENOIOCTL);
+ }
+}
+
+void
+pps_init(struct pps_state *pps)
+{
+ pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
+ if (pps->ppscap & PPS_CAPTUREASSERT)
+ pps->ppscap |= PPS_OFFSETASSERT;
+ if (pps->ppscap & PPS_CAPTURECLEAR)
+ pps->ppscap |= PPS_OFFSETCLEAR;
+#ifdef FFCLOCK
+ pps->ppscap |= PPS_TSCLK_MASK;
+#endif
+}
+
+void
+pps_capture(struct pps_state *pps)
+{
+ struct timehands *th;
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
+ th = timehands;
+ pps->capgen = th->th_generation;
+ pps->capth = th;
+#ifdef FFCLOCK
+ pps->capffth = fftimehands;
+#endif
+ pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
+ if (pps->capgen != th->th_generation)
+ pps->capgen = 0;
+}
+
+void
+pps_event(struct pps_state *pps, int event)
+{
+ struct bintime bt;
+ struct timespec ts, *tsp, *osp;
+ u_int tcount, *pcount;
+ int foff, fhard;
+ pps_seq_t *pseq;
+#ifdef FFCLOCK
+ struct timespec *tsp_ffc;
+ pps_seq_t *pseq_ffc;
+ ffcounter *ffcount;
+#endif
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
+ return;
+
+ /* Things would be easier with arrays. */
+ if (event == PPS_CAPTUREASSERT) {
+ tsp = &pps->ppsinfo.assert_timestamp;
+ osp = &pps->ppsparam.assert_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
+ fhard = pps->kcmode & PPS_CAPTUREASSERT;
+ pcount = &pps->ppscount[0];
+ pseq = &pps->ppsinfo.assert_sequence;
+#ifdef FFCLOCK
+ ffcount = &pps->ppsinfo_ffc.assert_ffcount;
+ tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
+ pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
+#endif
+ } else {
+ tsp = &pps->ppsinfo.clear_timestamp;
+ osp = &pps->ppsparam.clear_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
+ fhard = pps->kcmode & PPS_CAPTURECLEAR;
+ pcount = &pps->ppscount[1];
+ pseq = &pps->ppsinfo.clear_sequence;
+#ifdef FFCLOCK
+ ffcount = &pps->ppsinfo_ffc.clear_ffcount;
+ tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
+ pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
+#endif
+ }
+
+ /*
+ * If the timecounter changed, we cannot compare the count values, so
+ * we have to drop the rest of the PPS-stuff until the next event.
+ */
+ if (pps->ppstc != pps->capth->th_counter) {
+ pps->ppstc = pps->capth->th_counter;
+ *pcount = pps->capcount;
+ pps->ppscount[2] = pps->capcount;
+ return;
+ }
+
+ /* Convert the count to a timespec. */
+ tcount = pps->capcount - pps->capth->th_offset_count;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ bt = pps->capth->th_offset;
+ bintime_addx(&bt, pps->capth->th_scale * tcount);
+ bintime_add(&bt, &boottimebin);
+ bintime2timespec(&bt, &ts);
+
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen != pps->capth->th_generation)
+ return;
+
+ *pcount = pps->capcount;
+ (*pseq)++;
+ *tsp = ts;
+
+ if (foff) {
+ timespecadd(tsp, osp);
+ if (tsp->tv_nsec < 0) {
+ tsp->tv_nsec += 1000000000;
+ tsp->tv_sec -= 1;
+ }
+ }
+
+#ifdef FFCLOCK
+ *ffcount = pps->capffth->tick_ffcount + tcount;
+ bt = pps->capffth->tick_time;
+ ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
+ bintime_add(&bt, &pps->capffth->tick_time);
+ bintime2timespec(&bt, &ts);
+ (*pseq_ffc)++;
+ *tsp_ffc = ts;
+#endif
+
+#ifdef PPS_SYNC
+ if (fhard) {
+ uint64_t scale;
+
+ /*
+ * Feed the NTP PLL/FLL.
+ * The FLL wants to know how many (hardware) nanoseconds
+ * elapsed since the previous event.
+ */
+ tcount = pps->capcount - pps->ppscount[2];
+ pps->ppscount[2] = pps->capcount;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ scale = (uint64_t)1 << 63;
+ scale /= pps->capth->th_counter->tc_frequency;
+ scale *= 2;
+ bt.sec = 0;
+ bt.frac = 0;
+ bintime_addx(&bt, scale * tcount);
+ bintime2timespec(&bt, &ts);
+ hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
+ }
+#endif
+
+ /* Wakeup anyone sleeping in pps_fetch(). */
+ wakeup(pps);
+}
+
+/*
+ * Timecounters need to be updated every so often to prevent the hardware
+ * counter from overflowing. Updating also recalculates the cached values
+ * used by the get*() family of functions, so their precision depends on
+ * the update frequency.
+ */
+
+static int tc_tick;
+SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
+ "Approximate number of hardclock ticks in a millisecond");
+
+void
+tc_ticktock(int cnt)
+{
+ static int count;
+
+ count += cnt;
+ if (count < tc_tick)
+ return;
+ count = 0;
+ tc_windup();
+}
+
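+/*
+ * Recompute the precision thresholds from tc_timepercentage (an
+ * allowed-error percentage).  Illustrative arithmetic for the code
+ * below (the percentage used here is hypothetical): with
+ * tc_timepercentage = 5, t = 104 / 5 = 20, tc_precexp =
+ * fls(20 + 10) - 1 = 4, and both thresholds derived from hz are
+ * shifted left by 4 bits, i.e. scaled up by 16.
+ */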
+static void __inline
+tc_adjprecision(void)
+{
+ int t;
+
+ if (tc_timepercentage > 0) {
+ t = (99 + tc_timepercentage) / tc_timepercentage;
+ tc_precexp = fls(t + (t >> 1)) - 1;
+ FREQ2BT(hz / tc_tick, &bt_timethreshold);
+ FREQ2BT(hz, &bt_tickthreshold);
+ bintime_shift(&bt_timethreshold, tc_precexp);
+ bintime_shift(&bt_tickthreshold, tc_precexp);
+ } else {
+ tc_precexp = 31;
+ bt_timethreshold.sec = INT_MAX;
+ bt_timethreshold.frac = ~(uint64_t)0;
+ bt_tickthreshold = bt_timethreshold;
+ }
+ sbt_timethreshold = bttosbt(bt_timethreshold);
+ sbt_tickthreshold = bttosbt(bt_tickthreshold);
+}
+
+static int
+sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = tc_timepercentage;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ tc_timepercentage = val;
+ tc_adjprecision();
+ return (0);
+}
+
+static void
+inittimecounter(void *dummy)
+{
+ u_int p;
+ int tick_rate;
+
+ /*
+ * Set the initial timeout to
+ * max(1, <approx. number of hardclock ticks in a millisecond>).
+ * People should probably not use the sysctl to set the timeout
+ * to smaller than its initial value, since that value is the
+ * smallest reasonable one. If they want better timestamps they
+ * should use the non-"get"* functions.
+ */
+ if (hz > 1000)
+ tc_tick = (hz + 500) / 1000;
+ else
+ tc_tick = 1;
+ tc_adjprecision();
+ FREQ2BT(hz, &tick_bt);
+ tick_sbt = bttosbt(tick_bt);
+ tick_rate = hz / tc_tick;
+ FREQ2BT(tick_rate, &tc_tick_bt);
+ tc_tick_sbt = bttosbt(tc_tick_bt);
+ p = (tc_tick * 1000000) / hz;
+ printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
+
+#ifdef FFCLOCK
+ ffclock_init();
+#endif
+ /* warm up new timecounter (again) and get rolling. */
+ (void)timecounter->tc_get_timecount(timecounter);
+ (void)timecounter->tc_get_timecount(timecounter);
+ tc_windup();
+}
+
+SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);
+
+/* Cpu tick handling -------------------------------------------------*/
+
+static int cpu_tick_variable;
+static uint64_t cpu_tick_frequency;
+
+static uint64_t
+tc_cpu_ticks(void)
+{
+ static uint64_t base;
+ static unsigned last;
+ unsigned u;
+ struct timecounter *tc;
+
+ tc = timehands->th_counter;
+ u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+ if (u < last)
+ base += (uint64_t)tc->tc_counter_mask + 1;
+ last = u;
+ return (u + base);
+}
+
+void
+cpu_tick_calibration(void)
+{
+ static time_t last_calib;
+
+ if (time_uptime != last_calib && !(time_uptime & 0xf)) {
+ cpu_tick_calibrate(0);
+ last_calib = time_uptime;
+ }
+}
+
+/*
+ * This function gets called every 16 seconds on only one designated
+ * CPU in the system from hardclock() via cpu_tick_calibration().
+ *
+ * Whenever the real time clock is stepped we get called with reset=1
+ * to make sure we handle suspend/resume and similar events correctly.
+ */
+
+static void
+cpu_tick_calibrate(int reset)
+{
+ static uint64_t c_last;
+ uint64_t c_this, c_delta;
+ static struct bintime t_last;
+ struct bintime t_this, t_delta;
+ uint32_t divi;
+
+ if (reset) {
+ /* The clock was stepped, abort & reset */
+ t_last.sec = 0;
+ return;
+ }
+
+ /* we don't calibrate fixed rate cputicks */
+ if (!cpu_tick_variable)
+ return;
+
+ getbinuptime(&t_this);
+ c_this = cpu_ticks();
+ if (t_last.sec != 0) {
+ c_delta = c_this - c_last;
+ t_delta = t_this;
+ bintime_sub(&t_delta, &t_last);
+ /*
+ * Headroom:
+ * 2^(64-20) / 16[s] =
+ * 2^(44) / 16[s] =
+ * 17.592.186.044.416 / 16 =
+ * 1.099.511.627.776 [Hz]
+ */
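+		/*
+		 * Worked example (hypothetical numbers): for a 2.4 GHz
+		 * cputicker over a 16 s interval, c_delta = 38.4e9 and
+		 * divi = 16 << 20, so (c_delta << 20) / divi =
+		 * c_delta / 16 = 2.4e9 Hz, well within the 64-bit
+		 * headroom computed above.
+		 */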
+ divi = t_delta.sec << 20;
+ divi |= t_delta.frac >> (64 - 20);
+ c_delta <<= 20;
+ c_delta /= divi;
+ if (c_delta > cpu_tick_frequency) {
+ if (0 && bootverbose)
+ printf("cpu_tick increased to %ju Hz\n",
+ c_delta);
+ cpu_tick_frequency = c_delta;
+ }
+ }
+ c_last = c_this;
+ t_last = t_this;
+}
+
+void
+set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
+{
+
+ if (func == NULL) {
+ cpu_ticks = tc_cpu_ticks;
+ } else {
+ cpu_tick_frequency = freq;
+ cpu_tick_variable = var;
+ cpu_ticks = func;
+ }
+}
+
+uint64_t
+cpu_tickrate(void)
+{
+
+ if (cpu_ticks == tc_cpu_ticks)
+ return (tc_getfrequency());
+ return (cpu_tick_frequency);
+}
+
+/*
+ * We need to be slightly careful converting cputicks to microseconds.
+ * There is plenty of margin in 64 bits of microseconds (half a million
+ * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
+ * before divide conversion (to retain precision) we find that the
+ * margin shrinks to 1.5 hours (one millionth of 146y).
+ * With a three-prong approach we never lose significant bits, no
+ * matter what the cputick rate and the length of the time interval are.
+ */
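+
+/*
+ * Illustrative numbers for the branches below (the 4 GHz rate is
+ * hypothetical): 2^64 / 10^6 ~= 1.8e13 ticks, which a 4 GHz cputicker
+ * reaches after roughly 1.3 hours, so beyond that the middle branch
+ * multiplies by only 10^3; 2^64 / 10^3 ~= 1.8e16 ticks (~53 days at
+ * 4 GHz) is where the first branch takes over and divides the rate by
+ * 10^6 instead of multiplying the tick count at all.
+ */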
+
+uint64_t
+cputick2usec(uint64_t tick)
+{
+
+ if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */
+ return (tick / (cpu_tickrate() / 1000000LL));
+ else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */
+ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
+ else
+ return ((tick * 1000000LL) / cpu_tickrate());
+}
+
+cpu_tick_f *cpu_ticks = tc_cpu_ticks;
+
+static int vdso_th_enable = 1;
+static int
+sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
+{
+ int old_vdso_th_enable, error;
+
+ old_vdso_th_enable = vdso_th_enable;
+ error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
+ if (error != 0)
+ return (error);
+ vdso_th_enable = old_vdso_th_enable;
+ timekeep_push_vdso();
+ return (0);
+}
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
+
+uint32_t
+tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+ struct timehands *th;
+ uint32_t enabled;
+
+ th = timehands;
+ vdso_th->th_algo = VDSO_TH_ALGO_1;
+ vdso_th->th_scale = th->th_scale;
+ vdso_th->th_offset_count = th->th_offset_count;
+ vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
+ vdso_th->th_offset = th->th_offset;
+ vdso_th->th_boottime = boottimebin;
+ enabled = cpu_fill_vdso_timehands(vdso_th);
+ if (!vdso_th_enable)
+ enabled = 0;
+ return (enabled);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+ struct timehands *th;
+ uint32_t enabled;
+
+ th = timehands;
+ vdso_th32->th_algo = VDSO_TH_ALGO_1;
+ *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
+ vdso_th32->th_offset_count = th->th_offset_count;
+ vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
+ vdso_th32->th_offset.sec = th->th_offset.sec;
+ *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
+ vdso_th32->th_boottime.sec = boottimebin.sec;
+ *(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac;
+ enabled = cpu_fill_vdso_timehands32(vdso_th32);
+ if (!vdso_th_enable)
+ enabled = 0;
+ return (enabled);
+}
+#endif
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
new file mode 100644
index 0000000..4270b41
--- /dev/null
+++ b/sys/kern/kern_thr.c
@@ -0,0 +1,555 @@
+/*-
+ * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/ucontext.h>
+#include <sys/thr.h>
+#include <sys/rtprio.h>
+#include <sys/umtx.h>
+#include <sys/limits.h>
+
+#include <machine/frame.h>
+
+#include <security/audit/audit.h>
+
+static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0,
+ "thread allocation");
+
+static int max_threads_per_proc = 1500;
+SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
+ &max_threads_per_proc, 0, "Limit on threads per proc");
+
+static int max_threads_hits;
+SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
+ &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count");
+
+#ifdef COMPAT_FREEBSD32
+
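+/*
+ * Store a thread ID into a userland "long".  A 32-bit process running
+ * on a 64-bit kernel has a 32-bit long, so the store has to match the
+ * ABI of the current process.
+ */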
+static inline int
+suword_lwpid(void *addr, lwpid_t lwpid)
+{
+ int error;
+
+ if (SV_CURPROC_FLAG(SV_LP64))
+ error = suword(addr, lwpid);
+ else
+ error = suword32(addr, lwpid);
+ return (error);
+}
+
+#else
+#define suword_lwpid suword
+#endif
+
+static int create_thread(struct thread *td, mcontext_t *ctx,
+ void (*start_func)(void *), void *arg,
+ char *stack_base, size_t stack_size,
+ char *tls_base,
+ long *child_tid, long *parent_tid,
+ int flags, struct rtprio *rtp);
+
+/*
+ * System call interface.
+ */
+int
+sys_thr_create(struct thread *td, struct thr_create_args *uap)
+ /* ucontext_t *ctx, long *id, int flags */
+{
+ ucontext_t ctx;
+ int error;
+
+ if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
+ return (error);
+
+ error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
+ NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
+ return (error);
+}
+
+int
+sys_thr_new(struct thread *td, struct thr_new_args *uap)
+ /* struct thr_param * */
+{
+ struct thr_param param;
+ int error;
+
+ if (uap->param_size < 0 || uap->param_size > sizeof(param))
+ return (EINVAL);
+ bzero(&param, sizeof(param));
+ if ((error = copyin(uap->param, &param, uap->param_size)))
+ return (error);
+ return (kern_thr_new(td, &param));
+}
+
+int
+kern_thr_new(struct thread *td, struct thr_param *param)
+{
+ struct rtprio rtp, *rtpp;
+ int error;
+
+ rtpp = NULL;
+ if (param->rtp != 0) {
+ error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
+ if (error)
+ return (error);
+ rtpp = &rtp;
+ }
+ error = create_thread(td, NULL, param->start_func, param->arg,
+ param->stack_base, param->stack_size, param->tls_base,
+ param->child_tid, param->parent_tid, param->flags,
+ rtpp);
+ return (error);
+}
+
+static int
+create_thread(struct thread *td, mcontext_t *ctx,
+ void (*start_func)(void *), void *arg,
+ char *stack_base, size_t stack_size,
+ char *tls_base,
+ long *child_tid, long *parent_tid,
+ int flags, struct rtprio *rtp)
+{
+ stack_t stack;
+ struct thread *newtd;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+
+	/* There is a race condition here, but it is cheap. */
+ if (p->p_numthreads >= max_threads_per_proc) {
+ ++max_threads_hits;
+ return (EPROCLIM);
+ }
+
+ if (rtp != NULL) {
+ switch(rtp->type) {
+ case RTP_PRIO_REALTIME:
+ case RTP_PRIO_FIFO:
+ /* Only root can set scheduler policy */
+ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
+ return (EPERM);
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ break;
+ case RTP_PRIO_NORMAL:
+ rtp->prio = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(p, RACCT_NTHR, 1);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0)
+ return (EPROCLIM);
+#endif
+
+ /* Initialize our td */
+ newtd = thread_alloc(0);
+ if (newtd == NULL) {
+ error = ENOMEM;
+ goto fail;
+ }
+
+ cpu_set_upcall(newtd, td);
+
+ /*
+ * Try the copyout as soon as we allocate the td so we don't
+ * have to tear things down in a failure case below.
+	 * Here we copy out the tid to two places, one for the child and
+	 * one for the parent, because pthread can create a detached
+	 * thread; if the parent wants to safely access the child tid, it
+	 * has to provide its own storage, because the child thread may
+	 * exit quickly and the memory would be freed before the parent
+	 * thread could access it.
+ */
+ if ((child_tid != NULL &&
+ suword_lwpid(child_tid, newtd->td_tid)) ||
+ (parent_tid != NULL &&
+ suword_lwpid(parent_tid, newtd->td_tid))) {
+ thread_free(newtd);
+ error = EFAULT;
+ goto fail;
+ }
+
+ bzero(&newtd->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+ bcopy(&td->td_startcopy, &newtd->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+ newtd->td_proc = td->td_proc;
+ newtd->td_ucred = crhold(td->td_ucred);
+
+ if (ctx != NULL) { /* old way to set user context */
+ error = set_mcontext(newtd, ctx);
+ if (error != 0) {
+ thread_free(newtd);
+ crfree(td->td_ucred);
+ goto fail;
+ }
+ } else {
+ /* Set up our machine context. */
+ stack.ss_sp = stack_base;
+ stack.ss_size = stack_size;
+ /* Set upcall address to user thread entry function. */
+ cpu_set_upcall_kse(newtd, start_func, arg, &stack);
+ /* Setup user TLS address and TLS pointer register. */
+ error = cpu_set_user_tls(newtd, tls_base);
+ if (error != 0) {
+ thread_free(newtd);
+ crfree(td->td_ucred);
+ goto fail;
+ }
+ }
+
+ PROC_LOCK(td->td_proc);
+ td->td_proc->p_flag |= P_HADTHREADS;
+ thread_link(newtd, p);
+ bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
+ thread_lock(td);
+ /* let the scheduler know about these things. */
+ sched_fork_thread(td, newtd);
+ thread_unlock(td);
+ if (P_SHOULDSTOP(p))
+ newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ PROC_UNLOCK(p);
+
+ tidhash_add(newtd);
+
+ thread_lock(newtd);
+ if (rtp != NULL) {
+ if (!(td->td_pri_class == PRI_TIMESHARE &&
+ rtp->type == RTP_PRIO_NORMAL)) {
+ rtp_to_pri(rtp, newtd);
+ sched_prio(newtd, newtd->td_user_pri);
+ } /* ignore timesharing class */
+ }
+ TD_SET_CAN_RUN(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
+
+ return (0);
+
+fail:
+#ifdef RACCT
+ PROC_LOCK(p);
+ racct_sub(p, RACCT_NTHR, 1);
+ PROC_UNLOCK(p);
+#endif
+ return (error);
+}
+
+int
+sys_thr_self(struct thread *td, struct thr_self_args *uap)
+ /* long *id */
+{
+ int error;
+
+ error = suword_lwpid(uap->id, (unsigned)td->td_tid);
+ if (error == -1)
+ return (EFAULT);
+ return (0);
+}
+
+int
+sys_thr_exit(struct thread *td, struct thr_exit_args *uap)
+ /* long *state */
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /* Signal userland that it can free the stack. */
+ if ((void *)uap->state != NULL) {
+ suword_lwpid(uap->state, 1);
+ kern_umtx_wake(td, uap->state, INT_MAX, 0);
+ }
+
+ rw_wlock(&tidhash_lock);
+
+ PROC_LOCK(p);
+
+ /*
+	 * If this is the last thread in the proc, just return; exit()
+	 * will actually be called in the trampoline when the system call
+	 * returns.  Otherwise tear this thread down and exit it here.
+ */
+ if (p->p_numthreads != 1) {
+ racct_sub(p, RACCT_NTHR, 1);
+ LIST_REMOVE(td, td_hash);
+ rw_wunlock(&tidhash_lock);
+ tdsigcleanup(td);
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ thread_exit();
+ /* NOTREACHED */
+ }
+ PROC_UNLOCK(p);
+ rw_wunlock(&tidhash_lock);
+ return (0);
+}
+
+int
+sys_thr_kill(struct thread *td, struct thr_kill_args *uap)
+ /* long id, int sig */
+{
+ ksiginfo_t ksi;
+ struct thread *ttd;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->sig;
+ ksi.ksi_code = SI_LWP;
+ ksi.ksi_pid = p->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ if (uap->id == -1) {
+ if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+ error = EINVAL;
+ } else {
+ error = ESRCH;
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ if (ttd != td) {
+ error = 0;
+ if (uap->sig == 0)
+ break;
+ tdksignal(ttd, uap->sig, &ksi);
+ }
+ }
+ PROC_UNLOCK(p);
+ }
+ } else {
+ error = 0;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ if (uap->sig == 0)
+ ;
+ else if (!_SIG_VALID(uap->sig))
+ error = EINVAL;
+ else
+ tdksignal(ttd, uap->sig, &ksi);
+ PROC_UNLOCK(ttd->td_proc);
+ }
+ return (error);
+}
+
+int
+sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+ /* pid_t pid, long id, int sig */
+{
+ ksiginfo_t ksi;
+ struct thread *ttd;
+ struct proc *p;
+ int error;
+
+ AUDIT_ARG_SIGNUM(uap->sig);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->sig;
+ ksi.ksi_code = SI_LWP;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ if (uap->id == -1) {
+ if ((p = pfind(uap->pid)) == NULL)
+ return (ESRCH);
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->sig);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+ error = EINVAL;
+ } else {
+ error = ESRCH;
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ if (ttd != td) {
+ error = 0;
+ if (uap->sig == 0)
+ break;
+ tdksignal(ttd, uap->sig, &ksi);
+ }
+ }
+ }
+ PROC_UNLOCK(p);
+ } else {
+ ttd = tdfind((lwpid_t)uap->id, uap->pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ p = ttd->td_proc;
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->sig);
+ if (uap->sig == 0)
+ ;
+ else if (!_SIG_VALID(uap->sig))
+ error = EINVAL;
+ else
+ tdksignal(ttd, uap->sig, &ksi);
+ PROC_UNLOCK(p);
+ }
+ return (error);
+}
+
+int
+sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap)
+ /* const struct timespec *timeout */
+{
+ struct timespec ts, *tsp;
+ int error;
+
+ tsp = NULL;
+ if (uap->timeout != NULL) {
+ error = umtx_copyin_timeout(uap->timeout, &ts);
+ if (error != 0)
+ return (error);
+ tsp = &ts;
+ }
+
+ return (kern_thr_suspend(td, tsp));
+}
+
+int
+kern_thr_suspend(struct thread *td, struct timespec *tsp)
+{
+ struct proc *p = td->td_proc;
+ struct timeval tv;
+ int error = 0;
+ int timo = 0;
+
+ if (td->td_pflags & TDP_WAKEUP) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ return (0);
+ }
+
+ if (tsp != NULL) {
+ if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
+ error = EWOULDBLOCK;
+ else {
+ TIMESPEC_TO_TIMEVAL(&tv, tsp);
+ timo = tvtohz(&tv);
+ }
+ }
+
+ PROC_LOCK(p);
+ if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
+ error = msleep((void *)td, &p->p_mtx,
+ PCATCH, "lthr", timo);
+
+ if (td->td_flags & TDF_THRWAKEUP) {
+ thread_lock(td);
+ td->td_flags &= ~TDF_THRWAKEUP;
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_UNLOCK(p);
+ if (error == EWOULDBLOCK)
+ error = ETIMEDOUT;
+ else if (error == ERESTART) {
+ if (timo != 0)
+ error = EINTR;
+ }
+ return (error);
+}
+
+int
+sys_thr_wake(struct thread *td, struct thr_wake_args *uap)
+ /* long id */
+{
+ struct proc *p;
+ struct thread *ttd;
+
+ if (uap->id == td->td_tid) {
+ td->td_pflags |= TDP_WAKEUP;
+ return (0);
+ }
+
+ p = td->td_proc;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ thread_lock(ttd);
+ ttd->td_flags |= TDF_THRWAKEUP;
+ thread_unlock(ttd);
+ wakeup((void *)ttd);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+int
+sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+{
+ struct proc *p;
+ char name[MAXCOMLEN + 1];
+ struct thread *ttd;
+ int error;
+
+ error = 0;
+ name[0] = '\0';
+ if (uap->name != NULL) {
+ error = copyinstr(uap->name, name, sizeof(name),
+ NULL);
+ if (error)
+ return (error);
+ }
+ p = td->td_proc;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ strcpy(ttd->td_name, name);
+#ifdef KTR
+ sched_clear_tdname(ttd);
+#endif
+ PROC_UNLOCK(p);
+ return (error);
+}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
new file mode 100644
index 0000000..5da4866
--- /dev/null
+++ b/sys/kern/kern_thread.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "opt_witness.h"
+#include "opt_kdtrace.h"
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/resourcevar.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+#include <sys/selinfo.h>
+#include <sys/turnstile.h>
+#include <sys/ktr.h>
+#include <sys/rwlock.h>
+#include <sys/umtx.h>
+#include <sys/cpuset.h>
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <security/audit/audit.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+#include <sys/eventhandler.h>
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE(proc, , , lwp_exit, lwp-exit);
+
+
+/*
+ * thread related storage.
+ */
+static uma_zone_t thread_zone;
+
+TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
+static struct mtx zombie_lock;
+MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
+
+static void thread_zombie(struct thread *);
+
+#define TID_BUFFER_SIZE 1024
+
+struct mtx tid_lock;
+static struct unrhdr *tid_unrhdr;
+static lwpid_t tid_buffer[TID_BUFFER_SIZE];
+static int tid_head, tid_tail;
+static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
+
+struct tidhashhead *tidhashtbl;
+u_long tidhash;
+struct rwlock tidhash_lock;
+
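+/*
+ * Thread IDs normally come from the unit-number allocator (tid_unrhdr),
+ * which hands out numbers above PID_MAX.  Freed TIDs are parked in a
+ * small FIFO ring (tid_buffer) so they are not reused immediately; a
+ * TID is only returned to the allocator once the ring wraps, and the
+ * ring also serves as a fallback source when the allocator is
+ * exhausted.
+ */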
+static lwpid_t
+tid_alloc(void)
+{
+ lwpid_t tid;
+
+ tid = alloc_unr(tid_unrhdr);
+ if (tid != -1)
+ return (tid);
+ mtx_lock(&tid_lock);
+ if (tid_head == tid_tail) {
+ mtx_unlock(&tid_lock);
+ return (-1);
+ }
+ tid = tid_buffer[tid_head];
+ tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
+ mtx_unlock(&tid_lock);
+ return (tid);
+}
+
+static void
+tid_free(lwpid_t tid)
+{
+ lwpid_t tmp_tid = -1;
+
+ mtx_lock(&tid_lock);
+ if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) {
+ tmp_tid = tid_buffer[tid_head];
+ tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
+ }
+ tid_buffer[tid_tail] = tid;
+ tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE;
+ mtx_unlock(&tid_lock);
+ if (tmp_tid != -1)
+ free_unr(tid_unrhdr, tmp_tid);
+}
+
+/*
+ * Prepare a thread for use.
+ */
+static int
+thread_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+ td->td_state = TDS_INACTIVE;
+ td->td_oncpu = NOCPU;
+
+ td->td_tid = tid_alloc();
+
+ /*
+ * Note that td_critnest begins life as 1 because the thread is not
+ * running and is thereby implicitly waiting to be on the receiving
+ * end of a context switch.
+ */
+ td->td_critnest = 1;
+ td->td_lend_user_pri = PRI_MAX;
+ EVENTHANDLER_INVOKE(thread_ctor, td);
+#ifdef AUDIT
+ audit_thread_alloc(td);
+#endif
+ umtx_thread_alloc(td);
+ return (0);
+}
+
+/*
+ * Reclaim a thread after use.
+ */
+static void
+thread_dtor(void *mem, int size, void *arg)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+
+#ifdef INVARIANTS
+ /* Verify that this thread is in a safe state to free. */
+ switch (td->td_state) {
+ case TDS_INHIBITED:
+ case TDS_RUNNING:
+ case TDS_CAN_RUN:
+ case TDS_RUNQ:
+ /*
+ * We must never unlink a thread that is in one of
+ * these states, because it is currently active.
+ */
+ panic("bad state for thread unlinking");
+ /* NOTREACHED */
+ case TDS_INACTIVE:
+ break;
+ default:
+ panic("bad thread state");
+ /* NOTREACHED */
+ }
+#endif
+#ifdef AUDIT
+ audit_thread_free(td);
+#endif
+ /* Free all OSD associated to this thread. */
+ osd_thread_exit(td);
+
+ EVENTHANDLER_INVOKE(thread_dtor, td);
+ tid_free(td->td_tid);
+}
+
+/*
+ * Initialize type-stable parts of a thread (when newly created).
+ */
+static int
+thread_init(void *mem, int size, int flags)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+
+ td->td_sleepqueue = sleepq_alloc();
+ td->td_turnstile = turnstile_alloc();
+ td->td_rlqe = NULL;
+ EVENTHANDLER_INVOKE(thread_init, td);
+ td->td_sched = (struct td_sched *)&td[1];
+ umtx_thread_init(td);
+ td->td_kstack = 0;
+ return (0);
+}
+
+/*
+ * Tear down type-stable parts of a thread (just before being discarded).
+ */
+static void
+thread_fini(void *mem, int size)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+ EVENTHANDLER_INVOKE(thread_fini, td);
+ rlqentry_free(td->td_rlqe);
+ turnstile_free(td->td_turnstile);
+ sleepq_free(td->td_sleepqueue);
+ umtx_thread_fini(td);
+ seltdfini(td);
+}
+
+/*
+ * For a newly created process,
+ * link up all the structures and its initial threads etc.
+ * called from:
+ * {arch}/{arch}/machdep.c ia64_init(), init386() etc.
+ * proc_dtor() (should go away)
+ * proc_init()
+ */
+void
+proc_linkup0(struct proc *p, struct thread *td)
+{
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ proc_linkup(p, td);
+}
+
+void
+proc_linkup(struct proc *p, struct thread *td)
+{
+
+ sigqueue_init(&p->p_sigqueue, p);
+ p->p_ksi = ksiginfo_alloc(1);
+ if (p->p_ksi != NULL) {
+ /* XXX p_ksi may be null if ksiginfo zone is not ready */
+ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
+ }
+ LIST_INIT(&p->p_mqnotifier);
+ p->p_numthreads = 0;
+ thread_link(td, p);
+}
+
+/*
+ * Initialize global thread allocation resources.
+ */
+void
+threadinit(void)
+{
+
+ mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
+
+ /*
+ * pid_max cannot be greater than PID_MAX.
+	 * Leave one number for thread0.
+ */
+ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock);
+
+ thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
+ thread_ctor, thread_dtor, thread_init, thread_fini,
+ 16 - 1, 0);
+ tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
+ rw_init(&tidhash_lock, "tidhash");
+}
+
+/*
+ * Place an unused thread on the zombie list.
+ * Use the slpq as that must be unused by now.
+ */
+void
+thread_zombie(struct thread *td)
+{
+ mtx_lock_spin(&zombie_lock);
+ TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
+ mtx_unlock_spin(&zombie_lock);
+}
+
+/*
+ * Release a thread that has exited after cpu_throw().
+ */
+void
+thread_stash(struct thread *td)
+{
+ atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
+ thread_zombie(td);
+}
+
+/*
+ * Reap zombie resources.
+ */
+void
+thread_reap(void)
+{
+ struct thread *td_first, *td_next;
+
+ /*
+	 * Don't even bother to lock if there are none at this instant;
+	 * we really don't care about the next instant.
+ */
+ if (!TAILQ_EMPTY(&zombie_threads)) {
+ mtx_lock_spin(&zombie_lock);
+ td_first = TAILQ_FIRST(&zombie_threads);
+ if (td_first)
+ TAILQ_INIT(&zombie_threads);
+ mtx_unlock_spin(&zombie_lock);
+ while (td_first) {
+ td_next = TAILQ_NEXT(td_first, td_slpq);
+ if (td_first->td_ucred)
+ crfree(td_first->td_ucred);
+ thread_free(td_first);
+ td_first = td_next;
+ }
+ }
+}
+
+/*
+ * Allocate a thread.
+ */
+struct thread *
+thread_alloc(int pages)
+{
+ struct thread *td;
+
+ thread_reap(); /* check if any zombies to get */
+
+ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
+ KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
+ if (!vm_thread_new(td, pages)) {
+ uma_zfree(thread_zone, td);
+ return (NULL);
+ }
+ cpu_thread_alloc(td);
+ return (td);
+}
+
+int
+thread_alloc_stack(struct thread *td, int pages)
+{
+
+ KASSERT(td->td_kstack == 0,
+ ("thread_alloc_stack called on a thread with kstack"));
+ if (!vm_thread_new(td, pages))
+ return (0);
+ cpu_thread_alloc(td);
+ return (1);
+}
+
+/*
+ * Deallocate a thread.
+ */
+void
+thread_free(struct thread *td)
+{
+
+ lock_profile_thread_exit(td);
+ if (td->td_cpuset)
+ cpuset_rel(td->td_cpuset);
+ td->td_cpuset = NULL;
+ cpu_thread_free(td);
+ if (td->td_kstack != 0)
+ vm_thread_dispose(td);
+ uma_zfree(thread_zone, td);
+}
+
+/*
+ * Discard the current thread and exit from its context.
+ * Always called with scheduler locked.
+ *
+ * Because we can't free a thread while we're operating under its context,
+ * push the current thread into our CPU's deadthread holder. This means
+ * we needn't worry about someone else grabbing our context before we
+ * do a cpu_throw().
+ */
+void
+thread_exit(void)
+{
+ uint64_t runtime, new_switchtime;
+ struct thread *td;
+ struct thread *td2;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p != NULL, ("thread exiting without a process"));
+ CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
+ (long)p->p_pid, td->td_name);
+ KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
+
+#ifdef AUDIT
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
+ umtx_thread_exit(td);
+ /*
+	 * Drop FPU & debug register state storage, or any other
+	 * architecture-specific resources that would not be present
+	 * in a new, untouched process.
+ */
+ cpu_thread_exit(td); /* XXXSMP */
+
+ /*
+	 * The last thread is left attached to the process so that the
+	 * whole bundle gets recycled.  Skip all this stuff if we never
+	 * had threads.  EXIT clears all signs of other threads when it
+	 * goes to single threading, so the last thread always takes
+	 * the short path.
+ */
+ if (p->p_flag & P_HADTHREADS) {
+ if (p->p_numthreads > 1) {
+ thread_unlink(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td2, td);
+
+ /*
+ * The test below is NOT true if we are the
+ * sole exiting thread. P_STOPPED_SINGLE is unset
+ * in exit1() after it is the only survivor.
+ */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ if (p->p_numthreads == p->p_suspcount) {
+ thread_lock(p->p_singlethread);
+ wakeup_swapper = thread_unsuspend_one(
+ p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ if (wakeup_swapper)
+ kick_proc0();
+ }
+ }
+
+ atomic_add_int(&td->td_proc->p_exitthreads, 1);
+ PCPU_SET(deadthread, td);
+ } else {
+ /*
+ * The last thread is exiting.. but not through exit()
+ */
+ panic ("thread_exit: Last thread exiting on its own");
+ }
+ }
+#ifdef HWPMC_HOOKS
+ /*
+ * If this thread is part of a process that is being tracked by hwpmc(4),
+ * inform the module of the thread's impending exit.
+ */
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+ PROC_UNLOCK(p);
+
+ /* Do the same timestamp bookkeeping that mi_switch() would do. */
+ new_switchtime = cpu_ticks();
+ runtime = new_switchtime - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, new_switchtime);
+ PCPU_SET(switchticks, ticks);
+ PCPU_INC(cnt.v_swtch);
+
+ /* Save our resource usage in our process. */
+ td->td_ru.ru_nvcsw++;
+ ruxagg(p, td);
+ rucollect(&p->p_ru, &td->td_ru);
+
+ thread_lock(td);
+ PROC_SUNLOCK(p);
+ td->td_state = TDS_INACTIVE;
+#ifdef WITNESS
+ witness_thread_exit(td);
+#endif
+ CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
+ sched_throw(td);
+ panic("I'm a teapot!");
+ /* NOTREACHED */
+}
+
+/*
+ * Do any thread specific cleanups that may be needed in wait()
+ * called with Giant, proc and schedlock not held.
+ */
+void
+thread_wait(struct proc *p)
+{
+ struct thread *td;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+ KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
+ td = FIRST_THREAD_IN_PROC(p);
+ /* Lock the last thread so we spin until it exits cpu_throw(). */
+ thread_lock(td);
+ thread_unlock(td);
+ /* Wait for any remaining threads to exit cpu_throw(). */
+ while (p->p_exitthreads)
+ sched_relinquish(curthread);
+ lock_profile_thread_exit(td);
+ cpuset_rel(td->td_cpuset);
+ td->td_cpuset = NULL;
+ cpu_thread_clean(td);
+ crfree(td->td_ucred);
+ thread_reap(); /* check for zombie threads etc. */
+}
+
+/*
+ * Link a thread to a process.
+ * Set up anything that needs to be initialized for it to
+ * be used by the process.
+ */
+void
+thread_link(struct thread *td, struct proc *p)
+{
+
+ /*
+ * XXX This can't be enabled because it's called for proc0 before
+ * its lock has been created.
+ * PROC_LOCK_ASSERT(p, MA_OWNED);
+ */
+ td->td_state = TDS_INACTIVE;
+ td->td_proc = p;
+ td->td_flags = TDF_INMEM;
+
+ LIST_INIT(&td->td_contested);
+ LIST_INIT(&td->td_lprof[0]);
+ LIST_INIT(&td->td_lprof[1]);
+ sigqueue_init(&td->td_sigqueue, p);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
+ TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
+ p->p_numthreads++;
+}
+
+/*
+ * Convert a process with one thread to an unthreaded process.
+ */
+void
+thread_unthread(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads"));
+ p->p_flag &= ~P_HADTHREADS;
+}
+
+/*
+ * Called from:
+ * thread_exit()
+ */
+void
+thread_unlink(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ TAILQ_REMOVE(&p->p_threads, td, td_plist);
+ p->p_numthreads--;
+ /* could clear a few other things here */
+ /* Must NOT clear links to proc! */
+}
+
+static int
+calc_remaining(struct proc *p, int mode)
+{
+ int remaining;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ if (mode == SINGLE_EXIT)
+ remaining = p->p_numthreads;
+ else if (mode == SINGLE_BOUNDARY)
+ remaining = p->p_numthreads - p->p_boundary_count;
+ else if (mode == SINGLE_NO_EXIT)
+ remaining = p->p_numthreads - p->p_suspcount;
+ else
+ panic("calc_remaining: wrong mode %d", mode);
+ return (remaining);
+}
+
+/*
+ * Enforce single-threading.
+ *
+ * Returns 1 if the caller must abort (another thread is waiting to
+ * exit the process or similar). Process is locked!
+ * Returns 0 when you are successfully the only thread running.
+ * A process has successfully single-threaded in the suspend mode when
+ * there are no threads in user mode.  Threads in the kernel must be
+ * allowed to continue until they get to the user boundary.  They may even
+ * copy out their return values and data before suspending.  They may,
+ * however, be accelerated in reaching the user boundary as we will wake up
+ * any sleeping threads that are interruptible (PCATCH).
+ */
+int
+thread_single(int mode)
+{
+ struct thread *td;
+ struct thread *td2;
+ struct proc *p;
+ int remaining, wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ mtx_assert(&Giant, MA_NOTOWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if ((p->p_flag & P_HADTHREADS) == 0)
+ return (0);
+
+ /* Is someone already single threading? */
+ if (p->p_singlethread != NULL && p->p_singlethread != td)
+ return (1);
+
+ if (mode == SINGLE_EXIT) {
+ p->p_flag |= P_SINGLE_EXIT;
+ p->p_flag &= ~P_SINGLE_BOUNDARY;
+ } else {
+ p->p_flag &= ~P_SINGLE_EXIT;
+ if (mode == SINGLE_BOUNDARY)
+ p->p_flag |= P_SINGLE_BOUNDARY;
+ else
+ p->p_flag &= ~P_SINGLE_BOUNDARY;
+ }
+ p->p_flag |= P_STOPPED_SINGLE;
+ PROC_SLOCK(p);
+ p->p_singlethread = td;
+ remaining = calc_remaining(p, mode);
+ while (remaining != 1) {
+ if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
+ goto stopme;
+ wakeup_swapper = 0;
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ if (td2 == td)
+ continue;
+ thread_lock(td2);
+ td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ if (TD_IS_INHIBITED(td2)) {
+ switch (mode) {
+ case SINGLE_EXIT:
+ if (TD_IS_SUSPENDED(td2))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, EINTR);
+ break;
+ case SINGLE_BOUNDARY:
+ if (TD_IS_SUSPENDED(td2) &&
+ !(td2->td_flags & TDF_BOUNDARY))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, ERESTART);
+ break;
+ case SINGLE_NO_EXIT:
+ if (TD_IS_SUSPENDED(td2) &&
+ !(td2->td_flags & TDF_BOUNDARY))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, ERESTART);
+ break;
+ default:
+ break;
+ }
+ }
+#ifdef SMP
+ else if (TD_IS_RUNNING(td2) && td != td2) {
+ forward_signal(td2);
+ }
+#endif
+ thread_unlock(td2);
+ }
+ if (wakeup_swapper)
+ kick_proc0();
+ remaining = calc_remaining(p, mode);
+
+ /*
+ * Maybe we suspended some threads.. was it enough?
+ */
+ if (remaining == 1)
+ break;
+
+stopme:
+ /*
+ * Wake us up when everyone else has suspended.
+	 * In the meantime we suspend as well.
+ */
+ thread_suspend_switch(td);
+ remaining = calc_remaining(p, mode);
+ }
+ if (mode == SINGLE_EXIT) {
+ /*
+ * We have gotten rid of all the other threads and we
+ * are about to either exit or exec. In either case,
+ * we try our utmost to revert to being a non-threaded
+ * process.
+ */
+ p->p_singlethread = NULL;
+ p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
+ thread_unthread(td);
+ }
+ PROC_SUNLOCK(p);
+ return (0);
+}
+
+/*
+ * Called in from locations that can safely check to see
+ * whether we have to suspend or at least throttle for a
+ * single-thread event (e.g. fork).
+ *
+ * Such locations include userret().
+ * If the "return_instead" argument is non-zero, the thread must be able to
+ * accept 0 (caller may continue), or 1 (caller must abort) as a result.
+ *
+ * The 'return_instead' argument tells the function if it may do a
+ * thread_exit() or suspend, or whether the caller must abort and back
+ * out instead.
+ *
+ * If the thread that set the single_threading request has set the
+ * P_SINGLE_EXIT bit in the process flags then this call will never return
+ * if 'return_instead' is false, but will exit.
+ *
+ * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
+ *---------------+--------------------+---------------------
+ * 0 | returns 0 | returns 0 or 1
+ * | when ST ends | immediately
+ *---------------+--------------------+---------------------
+ * 1 | thread exits | returns 1
+ * | | immediately
+ * 0 = thread_exit() or suspension ok,
+ * other = return error instead of stopping the thread.
+ *
+ * While a full suspension is under effect, even a single threading
+ * thread would be suspended if it made this call (but it shouldn't).
+ * This call should only be made from places where
+ * thread_exit() would be safe as that may be the outcome unless
+ * return_instead is set.
+ */
+int
+thread_suspend_check(int return_instead)
+{
+ struct thread *td;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ mtx_assert(&Giant, MA_NOTOWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ while (P_SHOULDSTOP(p) ||
+ ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ KASSERT(p->p_singlethread != NULL,
+ ("singlethread not set"));
+ /*
+ * The only suspension in action is a
+ * single-threading. Single threader need not stop.
+ * XXX Should be safe to access unlocked
+ * as it can only be set to be true by us.
+ */
+ if (p->p_singlethread == td)
+ return (0); /* Exempt from stopping. */
+ }
+ if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
+ return (EINTR);
+
+ /* Should we goto user boundary if we didn't come from there? */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
+ (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
+ return (ERESTART);
+
+ /*
+ * Ignore suspend requests for stop signals if they
+ * are deferred.
+ */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SIG &&
+ td->td_flags & TDF_SBDRY) {
+ KASSERT(return_instead,
+ ("TDF_SBDRY set for unsafe thread_suspend_check"));
+ return (0);
+ }
+
+ /*
+ * If the process is waiting for us to exit,
+ * this thread should just suicide.
+ * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
+ */
+ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
+ PROC_UNLOCK(p);
+ tidhash_remove(td);
+ PROC_LOCK(p);
+ tdsigcleanup(td);
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ thread_exit();
+ }
+
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ if (p->p_numthreads == p->p_suspcount + 1) {
+ thread_lock(p->p_singlethread);
+ wakeup_swapper =
+ thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ if (wakeup_swapper)
+ kick_proc0();
+ }
+ }
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ /*
+ * When a thread suspends, it just
+ * gets taken off all queues.
+ */
+ thread_suspend_one(td);
+ if (return_instead == 0) {
+ p->p_boundary_count++;
+ td->td_flags |= TDF_BOUNDARY;
+ }
+ PROC_SUNLOCK(p);
+ mi_switch(SW_INVOL | SWT_SUSPEND, NULL);
+ if (return_instead == 0)
+ td->td_flags &= ~TDF_BOUNDARY;
+ thread_unlock(td);
+ PROC_LOCK(p);
+ if (return_instead == 0) {
+ PROC_SLOCK(p);
+ p->p_boundary_count--;
+ PROC_SUNLOCK(p);
+ }
+ }
+ return (0);
+}
+
+void
+thread_suspend_switch(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * We implement thread_suspend_one in stages here to avoid
+ * dropping the proc lock while the thread lock is owned.
+ */
+ thread_stopped(p);
+ p->p_suspcount++;
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ td->td_flags &= ~TDF_NEEDSUSPCHK;
+ TD_SET_SUSPENDED(td);
+ sched_sleep(td, 0);
+ PROC_SUNLOCK(p);
+ DROP_GIANT();
+ mi_switch(SW_VOL | SWT_SUSPEND, NULL);
+ thread_unlock(td);
+ PICKUP_GIANT();
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+}
+
+void
+thread_suspend_one(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+ p->p_suspcount++;
+ td->td_flags &= ~TDF_NEEDSUSPCHK;
+ TD_SET_SUSPENDED(td);
+ sched_sleep(td, 0);
+}
+
+int
+thread_unsuspend_one(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
+ TD_CLR_SUSPENDED(td);
+ p->p_suspcount--;
+ return (setrunnable(td));
+}
+
+/*
+ * Allow all threads blocked by single threading to continue running.
+ */
+void
+thread_unsuspend(struct proc *p)
+{
+ struct thread *td;
+ int wakeup_swapper;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ wakeup_swapper = 0;
+ if (!P_SHOULDSTOP(p)) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ wakeup_swapper |= thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
+ }
+ } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
+ (p->p_numthreads == p->p_suspcount)) {
+ /*
+ * Stopping everything also did the job for the single
+ * threading request. Now we've downgraded to single-threaded,
+ * let it continue.
+ */
+ thread_lock(p->p_singlethread);
+ wakeup_swapper = thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ }
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * End the single-threading mode.
+ */
+void
+thread_single_end(void)
+{
+ struct thread *td;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
+ PROC_SLOCK(p);
+ p->p_singlethread = NULL;
+ wakeup_swapper = 0;
+ /*
+ * If there are other threads they may now run,
+ * unless of course there is a blanket 'stop order'
+ * on the process. The single threader must be allowed
+ * to continue however as this is a bad place to stop.
+ */
+ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ wakeup_swapper |= thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
+ }
+ }
+ PROC_SUNLOCK(p);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+struct thread *
+thread_find(struct proc *p, lwpid_t tid)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_tid == tid)
+ break;
+ }
+ return (td);
+}
+
+/* Locate a thread by number; return with proc lock held. */
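+/*
+ * Lookups that have to scan more than RUN_THRESH entries move the
+ * found thread to the head of its hash chain (when the lock can be
+ * upgraded), so that frequently looked-up threads stay cheap to find.
+ */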
+struct thread *
+tdfind(lwpid_t tid, pid_t pid)
+{
+#define RUN_THRESH 16
+ struct thread *td;
+ int run = 0;
+
+ rw_rlock(&tidhash_lock);
+ LIST_FOREACH(td, TIDHASH(tid), td_hash) {
+ if (td->td_tid == tid) {
+ if (pid != -1 && td->td_proc->p_pid != pid) {
+ td = NULL;
+ break;
+ }
+ PROC_LOCK(td->td_proc);
+ if (td->td_proc->p_state == PRS_NEW) {
+ PROC_UNLOCK(td->td_proc);
+ td = NULL;
+ break;
+ }
+ if (run > RUN_THRESH) {
+ if (rw_try_upgrade(&tidhash_lock)) {
+ LIST_REMOVE(td, td_hash);
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid),
+ td, td_hash);
+ rw_wunlock(&tidhash_lock);
+ return (td);
+ }
+ }
+ break;
+ }
+ run++;
+ }
+ rw_runlock(&tidhash_lock);
+ return (td);
+}
+
+void
+tidhash_add(struct thread *td)
+{
+ rw_wlock(&tidhash_lock);
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
+ rw_wunlock(&tidhash_lock);
+}
+
+void
+tidhash_remove(struct thread *td)
+{
+ rw_wlock(&tidhash_lock);
+ LIST_REMOVE(td, td_hash);
+ rw_wunlock(&tidhash_lock);
+}
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
new file mode 100644
index 0000000..3aaed60
--- /dev/null
+++ b/sys/kern/kern_time.c
@@ -0,0 +1,1648 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_time.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/clock.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/sleepqueue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/time.h>
+#include <sys/timers.h>
+#include <sys/timetc.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
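+/*
+ * CPU-time clock IDs are encoded with the high bit set: CPUCLOCK_BIT
+ * marks a CPU-time clock, CPUCLOCK_PROCESS_BIT selects a process (rather
+ * than thread) clock, and the remaining bits carry the pid or tid.
+ */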
+#define MAX_CLOCKS (CLOCK_MONOTONIC+1)
+#define CPUCLOCK_BIT 0x80000000
+#define CPUCLOCK_PROCESS_BIT 0x40000000
+#define CPUCLOCK_ID_MASK (~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT))
+#define MAKE_THREAD_CPUCLOCK(tid) (CPUCLOCK_BIT|(tid))
+#define MAKE_PROCESS_CPUCLOCK(pid) \
+ (CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid))
+
+static struct kclock posix_clocks[MAX_CLOCKS];
+static uma_zone_t itimer_zone = NULL;
+
+/*
+ * Time of day and interval timer support.
+ *
+ * These routines provide the kernel entry points to get and set
+ * the time-of-day and per-process interval timers. Subroutines
+ * here provide support for adding and subtracting timeval structures
+ * and decrementing interval timers, optionally reloading the interval
+ * timers when they expire.
+ */
+
+static int settime(struct thread *, struct timeval *);
+static void timevalfix(struct timeval *);
+
+static void itimer_start(void);
+static int itimer_init(void *, int, int);
+static void itimer_fini(void *, int);
+static void itimer_enter(struct itimer *);
+static void itimer_leave(struct itimer *);
+static struct itimer *itimer_find(struct proc *, int);
+static void itimers_alloc(struct proc *);
+static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
+static void itimers_event_hook_exit(void *arg, struct proc *p);
+static int realtimer_create(struct itimer *);
+static int realtimer_gettime(struct itimer *, struct itimerspec *);
+static int realtimer_settime(struct itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int realtimer_delete(struct itimer *);
+static void realtimer_clocktime(clockid_t, struct timespec *);
+static void realtimer_expire(void *);
+
+int register_posix_clock(int, struct kclock *);
+void itimer_fire(struct itimer *it);
+int itimespecfix(struct timespec *ts);
+
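+/*
+ * Dispatch a timer operation to the kclock backend registered for the
+ * given clock id (see register_posix_clock()).
+ */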
+#define CLOCK_CALL(clock, call, arglist) \
+ ((*posix_clocks[clock].call) arglist)
+
+SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
+
+
+static int
+settime(struct thread *td, struct timeval *tv)
+{
+ struct timeval delta, tv1, tv2;
+ static struct timeval maxtime, laststep;
+ struct timespec ts;
+ int s;
+
+ s = splclock();
+ microtime(&tv1);
+ delta = *tv;
+ timevalsub(&delta, &tv1);
+
+ /*
+ * If the system is secure, we do not allow the time to be
+ * set to a value earlier than 1 second less than the highest
+ * time we have yet seen. The worst a miscreant can do in
+ * this circumstance is "freeze" time. He couldn't go
+ * back to the past.
+ *
+ * We similarly do not allow the clock to be stepped more
+ * than one second, nor more than once per second. This allows
+ * a miscreant to make the clock march double-time, but no worse.
+ */
+ if (securelevel_gt(td->td_ucred, 1) != 0) {
+ if (delta.tv_sec < 0 || delta.tv_usec < 0) {
+ /*
+ * Update maxtime to latest time we've seen.
+ */
+ if (tv1.tv_sec > maxtime.tv_sec)
+ maxtime = tv1;
+ tv2 = *tv;
+ timevalsub(&tv2, &maxtime);
+ if (tv2.tv_sec < -1) {
+ tv->tv_sec = maxtime.tv_sec - 1;
+ printf("Time adjustment clamped to -1 second\n");
+ }
+ } else {
+ if (tv1.tv_sec == laststep.tv_sec) {
+ splx(s);
+ return (EPERM);
+ }
+ if (delta.tv_sec > 1) {
+ tv->tv_sec = tv1.tv_sec + 1;
+ printf("Time adjustment clamped to +1 second\n");
+ }
+ laststep = *tv;
+ }
+ }
+
+ ts.tv_sec = tv->tv_sec;
+ ts.tv_nsec = tv->tv_usec * 1000;
+ mtx_lock(&Giant);
+ tc_setclock(&ts);
+ resettodr();
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getcpuclockid2_args {
+ id_t id;
+ int which;
+ clockid_t *clock_id;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap)
+{
+ clockid_t clk_id;
+ int error;
+
+ error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id);
+ if (error == 0)
+ error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
+ return (error);
+}
+
+int
+kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
+ clockid_t *clk_id)
+{
+ struct proc *p;
+ pid_t pid;
+ lwpid_t tid;
+ int error;
+
+ switch (which) {
+ case CPUCLOCK_WHICH_PID:
+ if (id != 0) {
+ p = pfind(id);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (error);
+ pid = id;
+ } else {
+ pid = td->td_proc->p_pid;
+ }
+ *clk_id = MAKE_PROCESS_CPUCLOCK(pid);
+ return (0);
+ case CPUCLOCK_WHICH_TID:
+ tid = id == 0 ? td->td_tid : id;
+ *clk_id = MAKE_THREAD_CPUCLOCK(tid);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_gettime_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap)
+{
+ struct timespec ats;
+ int error;
+
+ error = kern_clock_gettime(td, uap->clock_id, &ats);
+ if (error == 0)
+ error = copyout(&ats, uap->tp, sizeof(ats));
+
+ return (error);
+}
+
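+/*
+ * Convert a raw cputick count into a timespec, going through
+ * microseconds via cputick2usec().
+ */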
+static inline void
+cputick2timespec(uint64_t runtime, struct timespec *ats)
+{
+ runtime = cputick2usec(runtime);
+ ats->tv_sec = runtime / 1000000;
+ ats->tv_nsec = runtime % 1000000 * 1000;
+}
+
+static void
+get_thread_cputime(struct thread *targettd, struct timespec *ats)
+{
+ uint64_t runtime, curtime, switchtime;
+
+ if (targettd == NULL) { /* current thread */
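+ /*
+ * Sample the per-CPU switch time and the current tick count
+ * in a critical section so the time accumulated since the
+ * last context switch can be added to td_runtime below.
+ */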
+ critical_enter();
+ switchtime = PCPU_GET(switchtime);
+ curtime = cpu_ticks();
+ runtime = curthread->td_runtime;
+ critical_exit();
+ runtime += curtime - switchtime;
+ } else {
+ thread_lock(targettd);
+ runtime = targettd->td_runtime;
+ thread_unlock(targettd);
+ }
+ cputick2timespec(runtime, ats);
+}
+
+static void
+get_process_cputime(struct proc *targetp, struct timespec *ats)
+{
+ uint64_t runtime;
+ struct rusage ru;
+
+ PROC_SLOCK(targetp);
+ rufetch(targetp, &ru);
+ runtime = targetp->p_rux.rux_runtime;
+ PROC_SUNLOCK(targetp);
+ cputick2timespec(runtime, ats);
+}
+
+static int
+get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct proc *p, *p2;
+ struct thread *td2;
+ lwpid_t tid;
+ pid_t pid;
+ int error;
+
+ p = td->td_proc;
+ if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) {
+ tid = clock_id & CPUCLOCK_ID_MASK;
+ td2 = tdfind(tid, p->p_pid);
+ if (td2 == NULL)
+ return (EINVAL);
+ get_thread_cputime(td2, ats);
+ PROC_UNLOCK(td2->td_proc);
+ } else {
+ pid = clock_id & CPUCLOCK_ID_MASK;
+ error = pget(pid, PGET_CANSEE, &p2);
+ if (error != 0)
+ return (EINVAL);
+ get_process_cputime(p2, ats);
+ PROC_UNLOCK(p2);
+ }
+ return (0);
+}
+
+int
+kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct timeval sys, user;
+ struct proc *p;
+
+ p = td->td_proc;
+ switch (clock_id) {
+ case CLOCK_REALTIME: /* Default to precise. */
+ case CLOCK_REALTIME_PRECISE:
+ nanotime(ats);
+ break;
+ case CLOCK_REALTIME_FAST:
+ getnanotime(ats);
+ break;
+ case CLOCK_VIRTUAL:
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ TIMEVAL_TO_TIMESPEC(&user, ats);
+ break;
+ case CLOCK_PROF:
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ timevaladd(&user, &sys);
+ TIMEVAL_TO_TIMESPEC(&user, ats);
+ break;
+ case CLOCK_MONOTONIC: /* Default to precise. */
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_PRECISE:
+ nanouptime(ats);
+ break;
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_MONOTONIC_FAST:
+ getnanouptime(ats);
+ break;
+ case CLOCK_SECOND:
+ ats->tv_sec = time_second;
+ ats->tv_nsec = 0;
+ break;
+ case CLOCK_THREAD_CPUTIME_ID:
+ get_thread_cputime(NULL, ats);
+ break;
+ case CLOCK_PROCESS_CPUTIME_ID:
+ PROC_LOCK(p);
+ get_process_cputime(p, ats);
+ PROC_UNLOCK(p);
+ break;
+ default:
+ if ((int)clock_id >= 0)
+ return (EINVAL);
+ return (get_cputime(td, clock_id, ats));
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_settime_args {
+ clockid_t clock_id;
+ const struct timespec *tp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_settime(struct thread *td, struct clock_settime_args *uap)
+{
+ struct timespec ats;
+ int error;
+
+ if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
+ return (error);
+ return (kern_clock_settime(td, uap->clock_id, &ats));
+}
+
+int
+kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct timeval atv;
+ int error;
+
+ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
+ return (error);
+ if (clock_id != CLOCK_REALTIME)
+ return (EINVAL);
+ if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
+ return (EINVAL);
+ /* XXX Don't convert nsec->usec and back */
+ TIMESPEC_TO_TIMEVAL(&atv, ats);
+ error = settime(td, &atv);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getres_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+int
+sys_clock_getres(struct thread *td, struct clock_getres_args *uap)
+{
+ struct timespec ts;
+ int error;
+
+ if (uap->tp == NULL)
+ return (0);
+
+ error = kern_clock_getres(td, uap->clock_id, &ts);
+ if (error == 0)
+ error = copyout(&ts, uap->tp, sizeof(ts));
+ return (error);
+}
+
+int
+kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
+{
+
+ ts->tv_sec = 0;
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ case CLOCK_REALTIME_FAST:
+ case CLOCK_REALTIME_PRECISE:
+ case CLOCK_MONOTONIC:
+ case CLOCK_MONOTONIC_FAST:
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_UPTIME_PRECISE:
+ /*
+ * Round up the result of the division cheaply by adding 1.
+ * Rounding up is especially important if rounding down
+ * would give 0. Perfect rounding is unimportant.
+ */
+ ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
+ break;
+ case CLOCK_VIRTUAL:
+ case CLOCK_PROF:
+ /* Accurately round up here because we can do so cheaply. */
+ ts->tv_nsec = (1000000000 + hz - 1) / hz;
+ break;
+ case CLOCK_SECOND:
+ ts->tv_sec = 1;
+ ts->tv_nsec = 0;
+ break;
+ case CLOCK_THREAD_CPUTIME_ID:
+ case CLOCK_PROCESS_CPUTIME_ID:
+ cputime:
+ /* sync with cputick2usec */
+ ts->tv_nsec = 1000000 / cpu_tickrate();
+ if (ts->tv_nsec == 0)
+ ts->tv_nsec = 1000;
+ break;
+ default:
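+ /*
+ * Negative clock IDs are the CPU-time clocks encoded with
+ * CPUCLOCK_BIT above; report the same resolution for them.
+ */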
+ if ((int)clock_id < 0)
+ goto cputime;
+ return (EINVAL);
+ }
+ return (0);
+}
+
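+/*
+ * Per-CPU wait channels for nanosleep(); only their addresses are used,
+ * giving each CPU a distinct sleep identifier.
+ */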
+static uint8_t nanowait[MAXCPU];
+
+int
+kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
+{
+ struct timespec ts;
+ sbintime_t sbt, sbtt, prec, tmp;
+ time_t over;
+ int error;
+
+ if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
+ return (0);
+ ts = *rqt;
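+ /*
+ * Clamp extremely long requests so the sbintime conversion below
+ * cannot overflow; the clipped seconds are added back to the
+ * reported remaining time.
+ */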
+ if (ts.tv_sec > INT32_MAX / 2) {
+ over = ts.tv_sec - INT32_MAX / 2;
+ ts.tv_sec -= over;
+ } else
+ over = 0;
+ tmp = tstosbt(ts);
+ prec = tmp;
+ prec >>= tc_precexp;
+ if (TIMESEL(&sbt, tmp))
+ sbt += tc_tick_sbt;
+ sbt += tmp;
+ error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
+ sbt, prec, C_ABSOLUTE);
+ if (error != EWOULDBLOCK) {
+ if (error == ERESTART)
+ error = EINTR;
+ TIMESEL(&sbtt, tmp);
+ if (rmt != NULL) {
+ ts = sbttots(sbt - sbtt);
+ ts.tv_sec += over;
+ if (ts.tv_sec < 0)
+ timespecclear(&ts);
+ *rmt = ts;
+ }
+ if (sbtt >= sbt)
+ return (0);
+ return (error);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nanosleep_args {
+ struct timespec *rqtp;
+ struct timespec *rmtp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_nanosleep(struct thread *td, struct nanosleep_args *uap)
+{
+ struct timespec rmt, rqt;
+ int error;
+
+ error = copyin(uap->rqtp, &rqt, sizeof(rqt));
+ if (error)
+ return (error);
+
+ if (uap->rmtp &&
+ !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
+ return (EFAULT);
+ error = kern_nanosleep(td, &rqt, &rmt);
+ if (error && uap->rmtp) {
+ int error2;
+
+ error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
+ if (error2)
+ error = error2;
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct gettimeofday_args {
+ struct timeval *tp;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap)
+{
+ struct timeval atv;
+ struct timezone rtz;
+ int error = 0;
+
+ if (uap->tp) {
+ microtime(&atv);
+ error = copyout(&atv, uap->tp, sizeof (atv));
+ }
+ if (error == 0 && uap->tzp != NULL) {
+ rtz.tz_minuteswest = tz_minuteswest;
+ rtz.tz_dsttime = tz_dsttime;
+ error = copyout(&rtz, uap->tzp, sizeof (rtz));
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct settimeofday_args {
+ struct timeval *tv;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_settimeofday(struct thread *td, struct settimeofday_args *uap)
+{
+ struct timeval atv, *tvp;
+ struct timezone atz, *tzp;
+ int error;
+
+ if (uap->tv) {
+ error = copyin(uap->tv, &atv, sizeof(atv));
+ if (error)
+ return (error);
+ tvp = &atv;
+ } else
+ tvp = NULL;
+ if (uap->tzp) {
+ error = copyin(uap->tzp, &atz, sizeof(atz));
+ if (error)
+ return (error);
+ tzp = &atz;
+ } else
+ tzp = NULL;
+ return (kern_settimeofday(td, tvp, tzp));
+}
+
+int
+kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
+{
+ int error;
+
+ error = priv_check(td, PRIV_SETTIMEOFDAY);
+ if (error)
+ return (error);
+ /* Verify all parameters before changing time. */
+ if (tv) {
+ if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ error = settime(td, tv);
+ }
+ if (tzp && error == 0) {
+ tz_minuteswest = tzp->tz_minuteswest;
+ tz_dsttime = tzp->tz_dsttime;
+ }
+ return (error);
+}
+
+/*
+ * Get value of an interval timer. The process virtual and profiling virtual
+ * time timers are kept in the p_stats area, since they can be swapped out.
+ * These are kept internally in the way they are specified externally: in
+ * time until they expire.
+ *
+ * The real time interval timer is kept in the process table slot for the
+ * process, and its value (it_value) is kept as an absolute time rather than
+ * as a delta, so that it is easy to keep periodic real-time signals from
+ * drifting.
+ *
+ * Virtual time timers are processed in the hardclock() routine of
+ * kern_clock.c. The real time timer is processed by a timeout routine,
+ * called from the softclock() routine. Since a callout may be delayed in
+ * real time due to interrupt processing in the system, it is possible for
+ * the real time timeout routine (realitexpire, given below), to be delayed
+ * in real time past when it is supposed to occur. It does not suffice,
+ * therefore, to reload the real timer .it_value from the real time timers
+ * .it_interval. Rather, we compute the next time in absolute time the timer
+ * should go off.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getitimer_args {
+ u_int which;
+ struct itimerval *itv;
+};
+#endif
+int
+sys_getitimer(struct thread *td, struct getitimer_args *uap)
+{
+ struct itimerval aitv;
+ int error;
+
+ error = kern_getitimer(td, uap->which, &aitv);
+ if (error != 0)
+ return (error);
+ return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
+}
+
+int
+kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
+{
+ struct proc *p = td->td_proc;
+ struct timeval ctv;
+
+ if (which > ITIMER_PROF)
+ return (EINVAL);
+
+ if (which == ITIMER_REAL) {
+ /*
+ * Convert from absolute to relative time in .it_value
+ * part of real time timer. If time for real time timer
+ * has passed return 0, else return difference between
+ * current time and time for the timer to go off.
+ */
+ PROC_LOCK(p);
+ *aitv = p->p_realtimer;
+ PROC_UNLOCK(p);
+ if (timevalisset(&aitv->it_value)) {
+ microuptime(&ctv);
+ if (timevalcmp(&aitv->it_value, &ctv, <))
+ timevalclear(&aitv->it_value);
+ else
+ timevalsub(&aitv->it_value, &ctv);
+ }
+ } else {
+ PROC_SLOCK(p);
+ *aitv = p->p_stats->p_timer[which];
+ PROC_SUNLOCK(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setitimer_args {
+ u_int which;
+ struct itimerval *itv, *oitv;
+};
+#endif
+int
+sys_setitimer(struct thread *td, struct setitimer_args *uap)
+{
+ struct itimerval aitv, oitv;
+ int error;
+
+ if (uap->itv == NULL) {
+ uap->itv = uap->oitv;
+ return (sys_getitimer(td, (struct getitimer_args *)uap));
+ }
+
+ if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
+ return (error);
+ error = kern_setitimer(td, uap->which, &aitv, &oitv);
+ if (error != 0 || uap->oitv == NULL)
+ return (error);
+ return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
+}
+
+int
+kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
+ struct itimerval *oitv)
+{
+ struct proc *p = td->td_proc;
+ struct timeval ctv;
+ sbintime_t sbt, pr;
+
+ if (aitv == NULL)
+ return (kern_getitimer(td, which, oitv));
+
+ if (which > ITIMER_PROF)
+ return (EINVAL);
+ if (itimerfix(&aitv->it_value) ||
+ aitv->it_value.tv_sec > INT32_MAX / 2)
+ return (EINVAL);
+ if (!timevalisset(&aitv->it_value))
+ timevalclear(&aitv->it_interval);
+ else if (itimerfix(&aitv->it_interval) ||
+ aitv->it_interval.tv_sec > INT32_MAX / 2)
+ return (EINVAL);
+
+ if (which == ITIMER_REAL) {
+ PROC_LOCK(p);
+ if (timevalisset(&p->p_realtimer.it_value))
+ callout_stop(&p->p_itcallout);
+ microuptime(&ctv);
+ if (timevalisset(&aitv->it_value)) {
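+ /*
+ * Allow the callout a precision of the requested interval
+ * scaled down by 2^tc_precexp, then convert the value to an
+ * absolute uptime before arming the callout.
+ */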
+ pr = tvtosbt(aitv->it_value) >> tc_precexp;
+ timevaladd(&aitv->it_value, &ctv);
+ sbt = tvtosbt(aitv->it_value);
+ callout_reset_sbt(&p->p_itcallout, sbt, pr,
+ realitexpire, p, C_ABSOLUTE);
+ }
+ *oitv = p->p_realtimer;
+ p->p_realtimer = *aitv;
+ PROC_UNLOCK(p);
+ if (timevalisset(&oitv->it_value)) {
+ if (timevalcmp(&oitv->it_value, &ctv, <))
+ timevalclear(&oitv->it_value);
+ else
+ timevalsub(&oitv->it_value, &ctv);
+ }
+ } else {
+ PROC_SLOCK(p);
+ *oitv = p->p_stats->p_timer[which];
+ p->p_stats->p_timer[which] = *aitv;
+ PROC_SUNLOCK(p);
+ }
+ return (0);
+}
+
+/*
+ * Real interval timer expired:
+ * send process whose timer expired an alarm signal.
+ * If time is not set up to reload, then just return.
+ * Else compute next time timer should go off which is > current time.
+ * This is where delay in processing this timeout causes multiple
+ * SIGALRM calls to be compressed into one.
+ * tvtohz() always adds 1 to allow for the time until the next clock
+ * interrupt being strictly less than 1 clock tick, but we don't want
+ * that here since we want to appear to be in sync with the clock
+ * interrupt even when we're delayed.
+ */
+void
+realitexpire(void *arg)
+{
+ struct proc *p;
+ struct timeval ctv;
+ sbintime_t isbt;
+
+ p = (struct proc *)arg;
+ kern_psignal(p, SIGALRM);
+ if (!timevalisset(&p->p_realtimer.it_interval)) {
+ timevalclear(&p->p_realtimer.it_value);
+ if (p->p_flag & P_WEXIT)
+ wakeup(&p->p_itcallout);
+ return;
+ }
+ isbt = tvtosbt(p->p_realtimer.it_interval);
+ if (isbt >= sbt_timethreshold)
+ getmicrouptime(&ctv);
+ else
+ microuptime(&ctv);
+ do {
+ timevaladd(&p->p_realtimer.it_value,
+ &p->p_realtimer.it_interval);
+ } while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=));
+ callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value),
+ isbt >> tc_precexp, realitexpire, p, C_ABSOLUTE);
+}
+
+/*
+ * Check that a proposed value to load into the .it_value or
+ * .it_interval part of an interval timer is acceptable, and
+ * fix it to have at least minimal value (i.e. if it is less
+ * than the resolution of the clock, round it up.)
+ */
+int
+itimerfix(struct timeval *tv)
+{
+
+ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ if (tv->tv_sec == 0 && tv->tv_usec != 0 &&
+ tv->tv_usec < (u_int)tick / 16)
+ tv->tv_usec = (u_int)tick / 16;
+ return (0);
+}
+
+/*
+ * Decrement an interval timer by a specified number
+ * of microseconds, which must be less than a second,
+ * i.e. < 1000000. If the timer expires, then reload
+ * it. In this case, carry over (usec - old value) to
+ * reduce the value reloaded into the timer so that
+ * the timer does not drift. This routine assumes
+ * that it is called in a context where the timers
+ * on which it is operating cannot change in value.
+ */
+int
+itimerdecr(struct itimerval *itp, int usec)
+{
+
+ if (itp->it_value.tv_usec < usec) {
+ if (itp->it_value.tv_sec == 0) {
+ /* expired, and already in next interval */
+ usec -= itp->it_value.tv_usec;
+ goto expire;
+ }
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ itp->it_value.tv_usec -= usec;
+ usec = 0;
+ if (timevalisset(&itp->it_value))
+ return (1);
+ /* expired, exactly at end of interval */
+expire:
+ if (timevalisset(&itp->it_interval)) {
+ itp->it_value = itp->it_interval;
+ itp->it_value.tv_usec -= usec;
+ if (itp->it_value.tv_usec < 0) {
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ } else
+ itp->it_value.tv_usec = 0; /* sec is already 0 */
+ return (0);
+}
+
+/*
+ * Add and subtract routines for timevals.
+ * N.B.: subtract routine doesn't deal with
+ * results which are before the beginning,
+ * it just gets very confused in this case.
+ * Caveat emptor.
+ */
+void
+timevaladd(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec += t2->tv_sec;
+ t1->tv_usec += t2->tv_usec;
+ timevalfix(t1);
+}
+
+void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+/*
+ * ratecheck(): simple time-based rate-limit checking.
+ */
+int
+ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
+{
+ struct timeval tv, delta;
+ int rv = 0;
+
+ getmicrouptime(&tv); /* NB: 10ms precision */
+ delta = tv;
+ timevalsub(&delta, lasttime);
+
+ /*
+ * The check for 0,0 ensures the message will be seen at least once,
+ * even if the interval is huge.
+ */
+ if (timevalcmp(&delta, mininterval, >=) ||
+ (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
+ *lasttime = tv;
+ rv = 1;
+ }
+
+ return (rv);
+}
+
+/*
+ * ppsratecheck(): packets (or events) per second limitation.
+ *
+ * Return 0 if the limit is to be enforced (e.g. the caller
+ * should drop a packet because of the rate limitation).
+ *
+ * maxpps of 0 always causes zero to be returned. maxpps of -1
+ * always causes 1 to be returned; this effectively defeats rate
+ * limiting.
+ *
+ * Note that we maintain the struct timeval for compatibility
+ * with other BSD systems.  We reuse the storage and just monitor
+ * clock ticks for minimal overhead.
+ */
+int
+ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
+{
+ int now;
+
+ /*
+ * Reset the last time and counter if this is the first call
+ * or more than a second has passed since the last update of
+ * lasttime.
+ */
+ now = ticks;
+ if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
+ lasttime->tv_sec = now;
+ *curpps = 1;
+ return (maxpps != 0);
+ } else {
+ (*curpps)++; /* NB: ignore potential overflow */
+ return (maxpps < 0 || *curpps < maxpps);
+ }
+}
+
+static void
+itimer_start(void)
+{
+ struct kclock rt_clock = {
+ .timer_create = realtimer_create,
+ .timer_delete = realtimer_delete,
+ .timer_settime = realtimer_settime,
+ .timer_gettime = realtimer_gettime,
+ .event_hook = NULL
+ };
+
+ itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
+ NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
+ register_posix_clock(CLOCK_REALTIME, &rt_clock);
+ register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
+ p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
+ p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
+ p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
+ EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
+ (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
+ (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
+}
+
+int
+register_posix_clock(int clockid, struct kclock *clk)
+{
+ if ((unsigned)clockid >= MAX_CLOCKS) {
+ printf("%s: invalid clockid\n", __func__);
+ return (0);
+ }
+ posix_clocks[clockid] = *clk;
+ return (1);
+}
+
+static int
+itimer_init(void *mem, int size, int flags)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
+ return (0);
+}
+
+static void
+itimer_fini(void *mem, int size)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_destroy(&it->it_mtx);
+}
+
+static void
+itimer_enter(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ it->it_usecount++;
+}
+
+static void
+itimer_leave(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
+
+ if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
+ wakeup(it);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_create_args {
+ clockid_t clock_id;
+ struct sigevent * evp;
+ int * timerid;
+};
+#endif
+int
+sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+{
+ struct sigevent *evp, ev;
+ int id;
+ int error;
+
+ if (uap->evp == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->evp, &ev, sizeof(ev));
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
+ if (error == 0) {
+ error = copyout(&id, uap->timerid, sizeof(int));
+ if (error != 0)
+ kern_ktimer_delete(td, id);
+ }
+ return (error);
+}
+
+int
+kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp,
+ int *timerid, int preset_id)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int id;
+ int error;
+
+ if (clock_id < 0 || clock_id >= MAX_CLOCKS)
+ return (EINVAL);
+
+ if (posix_clocks[clock_id].timer_create == NULL)
+ return (EINVAL);
+
+ if (evp != NULL) {
+ if (evp->sigev_notify != SIGEV_NONE &&
+ evp->sigev_notify != SIGEV_SIGNAL &&
+ evp->sigev_notify != SIGEV_THREAD_ID)
+ return (EINVAL);
+ if ((evp->sigev_notify == SIGEV_SIGNAL ||
+ evp->sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(evp->sigev_signo))
+ return (EINVAL);
+ }
+
+ if (p->p_itimers == NULL)
+ itimers_alloc(p);
+
+ it = uma_zalloc(itimer_zone, M_WAITOK);
+ it->it_flags = 0;
+ it->it_usecount = 0;
+ it->it_active = 0;
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ it->it_overrun = 0;
+ it->it_overrun_last = 0;
+ it->it_clockid = clock_id;
+ it->it_timerid = -1;
+ it->it_proc = p;
+ ksiginfo_init(&it->it_ksi);
+ it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ error = CLOCK_CALL(clock_id, timer_create, (it));
+ if (error != 0)
+ goto out;
+
+ PROC_LOCK(p);
+ if (preset_id != -1) {
+ KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
+ id = preset_id;
+ if (p->p_itimers->its_timers[id] != NULL) {
+ PROC_UNLOCK(p);
+ error = 0;
+ goto out;
+ }
+ } else {
+ /*
+ * Find a free timer slot, skipping those reserved
+ * for setitimer().
+ */
+ for (id = 3; id < TIMER_MAX; id++)
+ if (p->p_itimers->its_timers[id] == NULL)
+ break;
+ if (id == TIMER_MAX) {
+ PROC_UNLOCK(p);
+ error = EAGAIN;
+ goto out;
+ }
+ }
+ it->it_timerid = id;
+ p->p_itimers->its_timers[id] = it;
+ if (evp != NULL)
+ it->it_sigev = *evp;
+ else {
+ it->it_sigev.sigev_notify = SIGEV_SIGNAL;
+ switch (clock_id) {
+ default:
+ case CLOCK_REALTIME:
+ it->it_sigev.sigev_signo = SIGALRM;
+ break;
+ case CLOCK_VIRTUAL:
+ it->it_sigev.sigev_signo = SIGVTALRM;
+ break;
+ case CLOCK_PROF:
+ it->it_sigev.sigev_signo = SIGPROF;
+ break;
+ }
+ it->it_sigev.sigev_value.sival_int = id;
+ }
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
+ it->it_ksi.ksi_code = SI_TIMER;
+ it->it_ksi.ksi_value = it->it_sigev.sigev_value;
+ it->it_ksi.ksi_timerid = id;
+ }
+ PROC_UNLOCK(p);
+ *timerid = id;
+ return (0);
+
+out:
+ ITIMER_LOCK(it);
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+ uma_zfree(itimer_zone, it);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_delete_args {
+ int timerid;
+};
+#endif
+int
+sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+{
+
+ return (kern_ktimer_delete(td, uap->timerid));
+}
+
+static struct itimer *
+itimer_find(struct proc *p, int timerid)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((p->p_itimers == NULL) ||
+ (timerid < 0) || (timerid >= TIMER_MAX) ||
+ (it = p->p_itimers->its_timers[timerid]) == NULL) {
+ return (NULL);
+ }
+ ITIMER_LOCK(it);
+ if ((it->it_flags & ITF_DELETING) != 0) {
+ ITIMER_UNLOCK(it);
+ it = NULL;
+ }
+ return (it);
+}
+
+int
+kern_ktimer_delete(struct thread *td, int timerid)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+
+ PROC_LOCK(p);
+ it = itimer_find(p, timerid);
+ if (it == NULL) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ PROC_UNLOCK(p);
+
+ it->it_flags |= ITF_DELETING;
+ while (it->it_usecount > 0) {
+ it->it_flags |= ITF_WANTED;
+ msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
+ }
+ it->it_flags &= ~ITF_WANTED;
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+
+ PROC_LOCK(p);
+ if (KSI_ONQ(&it->it_ksi))
+ sigqueue_take(&it->it_ksi);
+ p->p_itimers->its_timers[timerid] = NULL;
+ PROC_UNLOCK(p);
+ uma_zfree(itimer_zone, it);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_settime_args {
+ int timerid;
+ int flags;
+ const struct itimerspec * value;
+ struct itimerspec * ovalue;
+};
+#endif
+int
+sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+{
+ struct itimerspec val, oval, *ovalp;
+ int error;
+
+ error = copyin(uap->value, &val, sizeof(val));
+ if (error != 0)
+ return (error);
+ ovalp = uap->ovalue != NULL ? &oval : NULL;
+ error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
+ if (error == 0 && uap->ovalue != NULL)
+ error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
+ return (error);
+}
+
+int
+kern_ktimer_settime(struct thread *td, int timer_id, int flags,
+ struct itimerspec *val, struct itimerspec *oval)
+{
+ struct proc *p;
+ struct itimer *it;
+ int error;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_settime, (it,
+ flags, val, oval));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_gettime_args {
+ int timerid;
+ struct itimerspec * value;
+};
+#endif
+int
+sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+{
+ struct itimerspec val;
+ int error;
+
+ error = kern_ktimer_gettime(td, uap->timerid, &val);
+ if (error == 0)
+ error = copyout(&val, uap->value, sizeof(val));
+ return (error);
+}
+
+int
+kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val)
+{
+ struct proc *p;
+ struct itimer *it;
+ int error;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_getoverrun_args {
+ int timerid;
+};
+#endif
+int
+sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int error;
+
+ PROC_LOCK(p);
+ if (uap->timerid < 3 ||
+ (it = itimer_find(p, uap->timerid)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = it->it_overrun_last;
+ ITIMER_UNLOCK(it);
+ PROC_UNLOCK(p);
+ error = 0;
+ }
+ return (error);
+}
+
+static int
+realtimer_create(struct itimer *it)
+{
+ callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
+ return (0);
+}
+
+static int
+realtimer_delete(struct itimer *it)
+{
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ /*
+ * Clear the timer's value and interval to tell realtimer_expire()
+ * not to rearm the timer.
+ */
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ ITIMER_UNLOCK(it);
+ callout_drain(&it->it_callout);
+ ITIMER_LOCK(it);
+ return (0);
+}
+
+static int
+realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
+{
+ struct timespec cts;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ *ovalue = it->it_time;
+ if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
+ timespecsub(&ovalue->it_value, &cts);
+ if (ovalue->it_value.tv_sec < 0 ||
+ (ovalue->it_value.tv_sec == 0 &&
+ ovalue->it_value.tv_nsec == 0)) {
+ ovalue->it_value.tv_sec = 0;
+ ovalue->it_value.tv_nsec = 1;
+ }
+ }
+ return (0);
+}
+
+static int
+realtimer_settime(struct itimer *it, int flags,
+ struct itimerspec *value, struct itimerspec *ovalue)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimerspec val;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ val = *value;
+ if (itimespecfix(&val.it_value))
+ return (EINVAL);
+
+ if (timespecisset(&val.it_value)) {
+ if (itimespecfix(&val.it_interval))
+ return (EINVAL);
+ } else {
+ timespecclear(&val.it_interval);
+ }
+
+ if (ovalue != NULL)
+ realtimer_gettime(it, ovalue);
+
+ it->it_time = val;
+ if (timespecisset(&val.it_value)) {
+ realtimer_clocktime(it->it_clockid, &cts);
+ ts = val.it_value;
+ if ((flags & TIMER_ABSTIME) == 0) {
+ /* Convert to absolute time. */
+ timespecadd(&it->it_time.it_value, &cts);
+ } else {
+ timespecsub(&ts, &cts);
+ /*
+ * We don't care if ts is negative; tvtohz() will
+ * fix it.
+ */
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ } else {
+ callout_stop(&it->it_callout);
+ }
+
+ return (0);
+}
+
+static void
+realtimer_clocktime(clockid_t id, struct timespec *ts)
+{
+ if (id == CLOCK_REALTIME)
+ getnanotime(ts);
+ else /* CLOCK_MONOTONIC */
+ getnanouptime(ts);
+}
+
+int
+itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ it = itimer_find(p, timerid);
+ if (it != NULL) {
+ ksi->ksi_overrun = it->it_overrun;
+ it->it_overrun_last = it->it_overrun;
+ it->it_overrun = 0;
+ ITIMER_UNLOCK(it);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+int
+itimespecfix(struct timespec *ts)
+{
+
+ if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
+ ts->tv_nsec = tick * 1000;
+ return (0);
+}
+
+/* Timeout callback for realtime timer */
+static void
+realtimer_expire(void *arg)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimer *it;
+
+ it = (struct itimer *)arg;
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ /* Only fire if time is reached. */
+ if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (timespecisset(&it->it_time.it_interval)) {
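+ /*
+ * Advance the expiration time by whole intervals until it
+ * is in the future, counting each missed period as an
+ * overrun.
+ */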
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ }
+ } else {
+ /* Single-shot timer; do not rearm. */
+ timespecclear(&it->it_time.it_value);
+ }
+ if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ }
+ itimer_enter(it);
+ ITIMER_UNLOCK(it);
+ itimer_fire(it);
+ ITIMER_LOCK(it);
+ itimer_leave(it);
+ } else if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
+ it);
+ }
+}
+
+void
+itimer_fire(struct itimer *it)
+{
+ struct proc *p = it->it_proc;
+ struct thread *td;
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ if (sigev_findtd(p, &it->it_sigev, &td) != 0) {
+ ITIMER_LOCK(it);
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ callout_stop(&it->it_callout);
+ ITIMER_UNLOCK(it);
+ return;
+ }
+ if (!KSI_ONQ(&it->it_ksi)) {
+ it->it_ksi.ksi_errno = 0;
+ ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev);
+ tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi);
+ } else {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ }
+ PROC_UNLOCK(p);
+ }
+}
+
+static void
+itimers_alloc(struct proc *p)
+{
+ struct itimers *its;
+ int i;
+
+ its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
+ LIST_INIT(&its->its_virtual);
+ LIST_INIT(&its->its_prof);
+ TAILQ_INIT(&its->its_worklist);
+ for (i = 0; i < TIMER_MAX; i++)
+ its->its_timers[i] = NULL;
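+ /* Install the new table unless another thread raced us to it. */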
+ PROC_LOCK(p);
+ if (p->p_itimers == NULL) {
+ p->p_itimers = its;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_UNLOCK(p);
+ free(its, M_SUBPROC);
+ }
+}
+
+static void
+itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ itimers_event_hook_exit(arg, p);
+}
+
+/* Clean up timers when certain process events (exec, exit) occur. */
+static void
+itimers_event_hook_exit(void *arg, struct proc *p)
+{
+ struct itimers *its;
+ struct itimer *it;
+ int event = (int)(intptr_t)arg;
+ int i;
+
+ if (p->p_itimers != NULL) {
+ its = p->p_itimers;
+ for (i = 0; i < MAX_CLOCKS; ++i) {
+ if (posix_clocks[i].event_hook != NULL)
+ CLOCK_CALL(i, event_hook, (p, i, event));
+ }
+ /*
+ * According to SUSv3, XSI interval timers should be inherited
+ * across exec by the new image.
+ */
+ if (event == ITIMER_EV_EXEC)
+ i = 3;
+ else if (event == ITIMER_EV_EXIT)
+ i = 0;
+ else
+ panic("unhandled event");
+ for (; i < TIMER_MAX; ++i) {
+ if ((it = its->its_timers[i]) != NULL)
+ kern_ktimer_delete(curthread, i);
+ }
+ if (its->its_timers[0] == NULL &&
+ its->its_timers[1] == NULL &&
+ its->its_timers[2] == NULL) {
+ free(its, M_SUBPROC);
+ p->p_itimers = NULL;
+ }
+ }
+}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
new file mode 100644
index 0000000..e3580fc
--- /dev/null
+++ b/sys/kern/kern_timeout.c
@@ -0,0 +1,1433 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_callout_profiling.h"
+#include "opt_kdtrace.h"
+#if defined(__arm__)
+#include "opt_timer.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/file.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/sleepqueue.h>
+#include <sys/sysctl.h>
+#include <sys/smp.h>
+
+#ifdef SMP
+#include <machine/cpu.h>
+#endif
+
+#ifndef NO_EVENTTIMERS
+DPCPU_DECLARE(sbintime_t, hardclocktime);
+#endif
+
+SDT_PROVIDER_DEFINE(callout_execute);
+SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_start, callout-start,
+ "struct callout *");
+SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_end, callout-end,
+ "struct callout *");
+
+#ifdef CALLOUT_PROFILING
+static int avg_depth;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
+ "Average number of items examined per softclock call. Units = 1/1000");
+static int avg_gcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
+ "Average number of Giant callouts made per softclock call. Units = 1/1000");
+static int avg_lockcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
+ "Average number of lock callouts made per softclock call. Units = 1/1000");
+static int avg_mpcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
+ "Average number of MP callouts made per softclock call. Units = 1/1000");
+static int avg_depth_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
+ "Average number of direct callouts examined per callout_process call. "
+ "Units = 1/1000");
+static int avg_lockcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
+ &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
+ "callout_process call. Units = 1/1000");
+static int avg_mpcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
+ 0, "Average number of MP direct callouts made per callout_process call. "
+ "Units = 1/1000");
+#endif
+
+static int ncallout;
+SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
+ "Number of entries in callwheel and size of timeout() preallocation");
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+u_int callwheelsize, callwheelmask;
+
+/*
+ * The callout cpu exec entities represent the information necessary to
+ * describe the state of callouts currently running on the CPU and the
+ * information needed to migrate callouts to a new callout cpu.  In
+ * particular, the first entry of the cc_exec_entity array holds the state
+ * for callouts running in SWI thread context, while the second one holds
+ * the state for callouts running directly from hardware interrupt context.
+ * Caching this information is important for deferring migration when the
+ * migrating callout is already running.
+ */
+struct cc_exec {
+ struct callout *cc_next;
+ struct callout *cc_curr;
+#ifdef SMP
+ void (*ce_migration_func)(void *);
+ void *ce_migration_arg;
+ int ce_migration_cpu;
+ sbintime_t ce_migration_time;
+ sbintime_t ce_migration_prec;
+#endif
+ bool cc_cancel;
+ bool cc_waiting;
+};
+
+/*
+ * There is one struct callout_cpu per cpu, holding all relevant
+ * state for the callout processing thread on the individual CPU.
+ */
+struct callout_cpu {
+ struct mtx_padalign cc_lock;
+ struct cc_exec cc_exec_entity[2];
+ struct callout *cc_callout;
+ struct callout_list *cc_callwheel;
+ struct callout_tailq cc_expireq;
+ struct callout_slist cc_callfree;
+ sbintime_t cc_firstevent;
+ sbintime_t cc_lastscan;
+ void *cc_cookie;
+ u_int cc_bucket;
+};
+
+#define cc_exec_curr cc_exec_entity[0].cc_curr
+#define cc_exec_next cc_exec_entity[0].cc_next
+#define cc_exec_cancel cc_exec_entity[0].cc_cancel
+#define cc_exec_waiting cc_exec_entity[0].cc_waiting
+#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
+#define cc_exec_next_dir cc_exec_entity[1].cc_next
+#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
+#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
+
+#ifdef SMP
+#define cc_migration_func cc_exec_entity[0].ce_migration_func
+#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
+#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
+#define cc_migration_time cc_exec_entity[0].ce_migration_time
+#define cc_migration_prec cc_exec_entity[0].ce_migration_prec
+#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func
+#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
+#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
+#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time
+#define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec
+
+struct callout_cpu cc_cpu[MAXCPU];
+#define CPUBLOCK MAXCPU
+#define CC_CPU(cpu) (&cc_cpu[(cpu)])
+#define CC_SELF() CC_CPU(PCPU_GET(cpuid))
+#else
+struct callout_cpu cc_cpu;
+#define CC_CPU(cpu) &cc_cpu
+#define CC_SELF() &cc_cpu
+#endif
+#define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock)
+#define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock)
+#define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED)
+
+static int timeout_cpu;
+
+static void callout_cpu_init(struct callout_cpu *cc);
+static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+ int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+ int direct);
+
+static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
+
+/**
+ * Locked by cc_lock:
+ * cc_curr - If a callout is in progress, it is cc_curr.
+ * If cc_curr is non-NULL, threads waiting in
+ * callout_drain() will be woken up as soon as the
+ * relevant callout completes.
+ * cc_cancel - Changing to 1 with both callout_lock and cc_lock held
+ * guarantees that the current callout will not run.
+ * The softclock() function sets this to 0 before it
+ * drops callout_lock to acquire c_lock, and it calls
+ * the handler only if cc_cancel is still 0 after
+ * cc_lock is successfully acquired.
+ * cc_waiting - If a thread is waiting in callout_drain(), then
+ * cc_waiting is nonzero. Set only when
+ * cc_curr is non-NULL.
+ */
+
+/*
+ * Resets the execution entity tied to a specific callout cpu.
+ */
+static void
+cc_cce_cleanup(struct callout_cpu *cc, int direct)
+{
+
+ cc->cc_exec_entity[direct].cc_curr = NULL;
+ cc->cc_exec_entity[direct].cc_next = NULL;
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ cc->cc_exec_entity[direct].cc_waiting = false;
+#ifdef SMP
+ cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
+ cc->cc_exec_entity[direct].ce_migration_time = 0;
+ cc->cc_exec_entity[direct].ce_migration_prec = 0;
+ cc->cc_exec_entity[direct].ce_migration_func = NULL;
+ cc->cc_exec_entity[direct].ce_migration_arg = NULL;
+#endif
+}
+
+/*
+ * Checks if migration is requested by a specific callout cpu.
+ */
+static int
+cc_cce_migrating(struct callout_cpu *cc, int direct)
+{
+
+#ifdef SMP
+ return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * Kernel low level callwheel initialization
+ * called on cpu0 during kernel startup.
+ */
+static void
+callout_callwheel_init(void *dummy)
+{
+ struct callout_cpu *cc;
+
+ /*
+ * Calculate the size of the callout wheel and the preallocated
+ * timeout() structures.
+ * XXX: Clip ncallout to what the previous maxusers-based formula
+ * yields at its maximum of 384 users.  This is still huge, but
+ * acceptable.
+ */
+ ncallout = imin(16 + maxproc + maxfiles, 18508);
+ TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
+
+ /*
+ * Calculate callout wheel size, should be next power of two higher
+ * than 'ncallout'.
+ */
+ callwheelsize = 1 << fls(ncallout);
+ callwheelmask = callwheelsize - 1;
+
+ /*
+ * Only cpu0 handles timeout(9) and receives a preallocation.
+ *
+ * XXX: Once all timeout(9) consumers are converted this can
+ * be removed.
+ */
+ timeout_cpu = PCPU_GET(cpuid);
+ cc = CC_CPU(timeout_cpu);
+ cc->cc_callout = malloc(ncallout * sizeof(struct callout),
+ M_CALLOUT, M_WAITOK);
+ callout_cpu_init(cc);
+}
+SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
+
+/*
+ * Initialize the per-cpu callout structures.
+ */
+static void
+callout_cpu_init(struct callout_cpu *cc)
+{
+ struct callout *c;
+ int i;
+
+ mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+ SLIST_INIT(&cc->cc_callfree);
+ cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
+ M_CALLOUT, M_WAITOK);
+ for (i = 0; i < callwheelsize; i++)
+ LIST_INIT(&cc->cc_callwheel[i]);
+ TAILQ_INIT(&cc->cc_expireq);
+ cc->cc_firstevent = INT64_MAX;
+ for (i = 0; i < 2; i++)
+ cc_cce_cleanup(cc, i);
+ if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */
+ return;
+ for (i = 0; i < ncallout; i++) {
+ c = &cc->cc_callout[i];
+ callout_init(c, 0);
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+ }
+}
+
+#ifdef SMP
+/*
+ * Switches the cpu tied to a specific callout.
+ * The function expects the incoming callout cpu to be locked and
+ * returns with the outgoing callout cpu locked.
+ */
+static struct callout_cpu *
+callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
+{
+ struct callout_cpu *new_cc;
+
+ MPASS(c != NULL && cc != NULL);
+ CC_LOCK_ASSERT(cc);
+
+ /*
+ * Disable interrupts and preemption while the callout cpu is
+ * marked blocked in order to avoid deadlocks, since a preempting
+ * thread may try to acquire the callout cpu lock.
+ */
+ c->c_cpu = CPUBLOCK;
+ spinlock_enter();
+ CC_UNLOCK(cc);
+ new_cc = CC_CPU(new_cpu);
+ CC_LOCK(new_cc);
+ spinlock_exit();
+ c->c_cpu = new_cpu;
+ return (new_cc);
+}
+#endif
+
+/*
+ * Start standard softclock thread.
+ */
+static void
+start_softclock(void *dummy)
+{
+ struct callout_cpu *cc;
+#ifdef SMP
+ int cpu;
+#endif
+
+ cc = CC_CPU(timeout_cpu);
+ if (swi_add(&clk_intr_event, "clock", softclock, cc, SWI_CLOCK,
+ INTR_MPSAFE, &cc->cc_cookie))
+ panic("died while creating standard software ithreads");
+#ifdef SMP
+ CPU_FOREACH(cpu) {
+ if (cpu == timeout_cpu)
+ continue;
+ cc = CC_CPU(cpu);
+ cc->cc_callout = NULL; /* Only cpu0 handles timeout(9). */
+ callout_cpu_init(cc);
+ if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK,
+ INTR_MPSAFE, &cc->cc_cookie))
+ panic("died while creating standard software ithreads");
+ }
+#endif
+}
+SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
+
+#define CC_HASH_SHIFT 8
+
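+/*
+ * Hash an sbintime into a callwheel bucket index: consecutive buckets
+ * each cover 1/2^CC_HASH_SHIFT (1/256) of a second.
+ */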
+static inline u_int
+callout_hash(sbintime_t sbt)
+{
+
+ return (sbt >> (32 - CC_HASH_SHIFT));
+}
+
+static inline u_int
+callout_get_bucket(sbintime_t sbt)
+{
+
+ return (callout_hash(sbt) & callwheelmask);
+}
+
+void
+callout_process(sbintime_t now)
+{
+ struct callout *tmp, *tmpn;
+ struct callout_cpu *cc;
+ struct callout_list *sc;
+ sbintime_t first, last, max, tmp_max;
+ uint32_t lookahead;
+ u_int firstb, lastb, nowb;
+#ifdef CALLOUT_PROFILING
+ int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
+#endif
+
+ cc = CC_SELF();
+ mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+
+ /* Compute the buckets of the last scan and present times. */
+ firstb = callout_hash(cc->cc_lastscan);
+ cc->cc_lastscan = now;
+ nowb = callout_hash(now);
+
+ /* Compute the last bucket and minimum time of the bucket after it. */
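+ /*
+ * The farther the wheel has advanced since the last scan, the
+ * larger the window of future events aggregated in this pass.
+ */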
+ if (nowb == firstb)
+ lookahead = (SBT_1S / 16);
+ else if (nowb - firstb == 1)
+ lookahead = (SBT_1S / 8);
+ else
+ lookahead = (SBT_1S / 2);
+ first = last = now;
+ first += (lookahead / 2);
+ last += lookahead;
+ last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
+ lastb = callout_hash(last) - 1;
+ max = last;
+
+ /*
+ * Check if we wrapped around the entire wheel since the last scan.
+ * If so, we need to scan the whole wheel for pending callouts.
+ */
+ if (lastb - firstb >= callwheelsize) {
+ lastb = firstb + callwheelsize - 1;
+ if (nowb - firstb >= callwheelsize)
+ nowb = lastb;
+ }
+
+ /* Iterate callwheel from firstb to nowb and then up to lastb. */
+ do {
+ sc = &cc->cc_callwheel[firstb & callwheelmask];
+ tmp = LIST_FIRST(sc);
+ while (tmp != NULL) {
+ /* Run the callout if its time has already been reached. */
+ if (tmp->c_time <= now) {
+ /*
+ * Consumer told us the callout may be run
+ * directly from hardware interrupt context.
+ */
+ if (tmp->c_flags & CALLOUT_DIRECT) {
+#ifdef CALLOUT_PROFILING
+ ++depth_dir;
+#endif
+ cc->cc_exec_next_dir =
+ LIST_NEXT(tmp, c_links.le);
+ cc->cc_bucket = firstb & callwheelmask;
+ LIST_REMOVE(tmp, c_links.le);
+ softclock_call_cc(tmp, cc,
+#ifdef CALLOUT_PROFILING
+ &mpcalls_dir, &lockcalls_dir, NULL,
+#endif
+ 1);
+ tmp = cc->cc_exec_next_dir;
+ } else {
+ tmpn = LIST_NEXT(tmp, c_links.le);
+ LIST_REMOVE(tmp, c_links.le);
+ TAILQ_INSERT_TAIL(&cc->cc_expireq,
+ tmp, c_links.tqe);
+ tmp->c_flags |= CALLOUT_PROCESSED;
+ tmp = tmpn;
+ }
+ continue;
+ }
+ /* Skip events in the distant future. */
+ if (tmp->c_time >= max)
+ goto next;
+ /*
+ * The event's minimal time is later than the present
+ * maximal time, so it cannot be aggregated.
+ */
+ if (tmp->c_time > last) {
+ lastb = nowb;
+ goto next;
+ }
+ /* Update first and last time, respecting this event. */
+ if (tmp->c_time < first)
+ first = tmp->c_time;
+ tmp_max = tmp->c_time + tmp->c_precision;
+ if (tmp_max < last)
+ last = tmp_max;
+next:
+ tmp = LIST_NEXT(tmp, c_links.le);
+ }
+ /* Proceed with the next bucket. */
+ firstb++;
+ /*
+ * Stop if we looked past the present time and found
+ * an event we cannot execute now, or if we looked far
+ * enough into the future.
+ */
+ } while (((int)(firstb - lastb)) <= 0);
+ cc->cc_firstevent = last;
+#ifndef NO_EVENTTIMERS
+ cpu_new_callout(curcpu, last, first);
+#endif
+#ifdef CALLOUT_PROFILING
+ avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
+ avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
+ avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
+#endif
+ mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+ /*
+ * swi_sched acquires the thread lock, so we don't want to call it
+ * with cc_lock held; incorrect locking order.
+ */
+ if (!TAILQ_EMPTY(&cc->cc_expireq))
+ swi_sched(cc->cc_cookie, 0);
+}
+
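+/*
+ * Lock the per-CPU callout structure that currently owns the callout,
+ * re-checking c_cpu after the lock is taken in case the callout was
+ * migrated to another CPU in the meantime.
+ */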
+static struct callout_cpu *
+callout_lock(struct callout *c)
+{
+ struct callout_cpu *cc;
+ int cpu;
+
+ for (;;) {
+ cpu = c->c_cpu;
+#ifdef SMP
+ if (cpu == CPUBLOCK) {
+ while (c->c_cpu == CPUBLOCK)
+ cpu_spinwait();
+ continue;
+ }
+#endif
+ cc = CC_CPU(cpu);
+ CC_LOCK(cc);
+ if (cpu == c->c_cpu)
+ break;
+ CC_UNLOCK(cc);
+ }
+ return (cc);
+}
+
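+/*
+ * Insert a callout into the callwheel of the given per-CPU callout
+ * structure and, if needed, tell the event timer code about the new
+ * earliest event.
+ */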
+static void
+callout_cc_add(struct callout *c, struct callout_cpu *cc,
+ sbintime_t sbt, sbintime_t precision, void (*func)(void *),
+ void *arg, int cpu, int flags)
+{
+ int bucket;
+
+ CC_LOCK_ASSERT(cc);
+ if (sbt < cc->cc_lastscan)
+ sbt = cc->cc_lastscan;
+ c->c_arg = arg;
+ c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+ if (flags & C_DIRECT_EXEC)
+ c->c_flags |= CALLOUT_DIRECT;
+ c->c_flags &= ~CALLOUT_PROCESSED;
+ c->c_func = func;
+ c->c_time = sbt;
+ c->c_precision = precision;
+ bucket = callout_get_bucket(c->c_time);
+ CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
+ c, (int)(c->c_precision >> 32),
+ (u_int)(c->c_precision & 0xffffffff));
+ LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
+ if (cc->cc_bucket == bucket)
+ cc->cc_exec_next_dir = c;
+#ifndef NO_EVENTTIMERS
+ /*
+	 * Inform the eventtimers(4) subsystem that a new callout has
+	 * been inserted, but only if really required.
+ */
+ sbt = c->c_time + c->c_precision;
+ if (sbt < cc->cc_firstevent) {
+ cc->cc_firstevent = sbt;
+ cpu_new_callout(cpu, sbt, c->c_time);
+ }
+#endif
+}
+
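+/*
+ * Return a timeout(9)-style (locally allocated) callout to the per-CPU
+ * free list; caller-owned callouts are left untouched.
+ */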
+static void
+callout_cc_del(struct callout *c, struct callout_cpu *cc)
+{
+
+ if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0)
+ return;
+ c->c_func = NULL;
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+}
+
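+/*
+ * Execute a single callout: release the callout cpu lock, take the
+ * consumer's lock (if any), run the handler, and then deal with drain
+ * waiters and deferred migration.  "direct" is nonzero when invoked
+ * from hardware interrupt context via callout_process() rather than
+ * from the softclock SWI.
+ */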
+static void
+softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+ int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+ int direct)
+{
+ void (*c_func)(void *);
+ void *c_arg;
+ struct lock_class *class;
+ struct lock_object *c_lock;
+ int c_flags, sharedlock;
+#ifdef SMP
+ struct callout_cpu *new_cc;
+ void (*new_func)(void *);
+ void *new_arg;
+ int flags, new_cpu;
+ sbintime_t new_prec, new_time;
+#endif
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbintime_t sbt1, sbt2;
+ struct timespec ts2;
+ static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */
+ static timeout_t *lastfunc;
+#endif
+
+ KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
+ (CALLOUT_PENDING | CALLOUT_ACTIVE),
+ ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
+ class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
+ sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+ c_lock = c->c_lock;
+ c_func = c->c_func;
+ c_arg = c->c_arg;
+ c_flags = c->c_flags;
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ else
+ c->c_flags &= ~CALLOUT_PENDING;
+ cc->cc_exec_entity[direct].cc_curr = c;
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ CC_UNLOCK(cc);
+ if (c_lock != NULL) {
+ class->lc_lock(c_lock, sharedlock);
+ /*
+ * The callout may have been cancelled
+ * while we switched locks.
+ */
+ if (cc->cc_exec_entity[direct].cc_cancel) {
+ class->lc_unlock(c_lock);
+ goto skip;
+ }
+ /* The callout cannot be stopped now. */
+ cc->cc_exec_entity[direct].cc_cancel = true;
+ if (c_lock == &Giant.lock_object) {
+#ifdef CALLOUT_PROFILING
+ (*gcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
+ c, c_func, c_arg);
+ } else {
+#ifdef CALLOUT_PROFILING
+ (*lockcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
+ c, c_func, c_arg);
+ }
+ } else {
+#ifdef CALLOUT_PROFILING
+ (*mpcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
+ c, c_func, c_arg);
+ }
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbt1 = sbinuptime();
+#endif
+ THREAD_NO_SLEEPING();
+ SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
+ c_func(c_arg);
+ SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
+ THREAD_SLEEPING_OK();
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbt2 = sbinuptime();
+ sbt2 -= sbt1;
+ if (sbt2 > maxdt) {
+ if (lastfunc != c_func || sbt2 > maxdt * 2) {
+ ts2 = sbttots(sbt2);
+ printf(
+ "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
+ c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
+ }
+ maxdt = sbt2;
+ lastfunc = c_func;
+ }
+#endif
+ CTR1(KTR_CALLOUT, "callout %p finished", c);
+ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+ class->lc_unlock(c_lock);
+skip:
+ CC_LOCK(cc);
+ KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
+ cc->cc_exec_entity[direct].cc_curr = NULL;
+ if (cc->cc_exec_entity[direct].cc_waiting) {
+ /*
+		 * There is someone waiting for the callout to complete.
+		 * If the callout was scheduled for migration, just
+		 * cancel it.
+ */
+ if (cc_cce_migrating(cc, direct)) {
+ cc_cce_cleanup(cc, direct);
+
+ /*
+			 * It should be asserted here that the callout is
+			 * not destroyed, but that is not easy to do.
+ */
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ }
+ cc->cc_exec_entity[direct].cc_waiting = false;
+ CC_UNLOCK(cc);
+ wakeup(&cc->cc_exec_entity[direct].cc_waiting);
+ CC_LOCK(cc);
+ } else if (cc_cce_migrating(cc, direct)) {
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
+ ("Migrating legacy callout %p", c));
+#ifdef SMP
+ /*
+		 * If the callout was scheduled for migration,
+		 * just perform it now.
+ */
+ new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
+ new_time = cc->cc_exec_entity[direct].ce_migration_time;
+ new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
+ new_func = cc->cc_exec_entity[direct].ce_migration_func;
+ new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
+ cc_cce_cleanup(cc, direct);
+
+ /*
+		 * It should be asserted here that the callout is not
+		 * destroyed, but that is not easy to do.
+		 *
+		 * First, handle deferred callout stops.
+ */
+ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+ CTR3(KTR_CALLOUT,
+ "deferred cancelled %p func %p arg %p",
+ c, new_func, new_arg);
+ callout_cc_del(c, cc);
+ return;
+ }
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+
+ new_cc = callout_cpu_switch(c, cc, new_cpu);
+ flags = (direct) ? C_DIRECT_EXEC : 0;
+ callout_cc_add(c, new_cc, new_time, new_prec, new_func,
+ new_arg, new_cpu, flags);
+ CC_UNLOCK(new_cc);
+ CC_LOCK(cc);
+#else
+ panic("migration should not happen");
+#endif
+ }
+ /*
+ * If the current callout is locally allocated (from
+ * timeout(9)) then put it on the freelist.
+ *
+	 * Note: we need to check the cached copy of c_flags because,
+	 * if the callout was not locally allocated, it is not safe to
+	 * dereference the callout pointer.
+ */
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
+ c->c_flags == CALLOUT_LOCAL_ALLOC,
+ ("corrupted callout"));
+ if (c_flags & CALLOUT_LOCAL_ALLOC)
+ callout_cc_del(c, cc);
+}
+
+/*
+ * The callout mechanism is based on the work of Adam M. Costello and
+ * George Varghese, published in a technical report entitled "Redesigning
+ * the BSD Callout and Timer Facilities" and modified slightly for inclusion
+ * in FreeBSD by Justin T. Gibbs. The original work on the data structures
+ * used in this implementation was published by G. Varghese and T. Lauck in
+ * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
+ * the Efficient Implementation of a Timer Facility" in the Proceedings of
+ * the 11th ACM Annual Symposium on Operating Systems Principles,
+ * Austin, Texas Nov 1987.
+ */
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+void
+softclock(void *arg)
+{
+ struct callout_cpu *cc;
+ struct callout *c;
+#ifdef CALLOUT_PROFILING
+ int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
+#endif
+
+ cc = (struct callout_cpu *)arg;
+ CC_LOCK(cc);
+ while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ softclock_call_cc(c, cc,
+#ifdef CALLOUT_PROFILING
+ &mpcalls, &lockcalls, &gcalls,
+#endif
+ 0);
+#ifdef CALLOUT_PROFILING
+ ++depth;
+#endif
+ }
+#ifdef CALLOUT_PROFILING
+ avg_depth += (depth * 1000 - avg_depth) >> 8;
+ avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
+ avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
+ avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
+#endif
+ CC_UNLOCK(cc);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * callout_handle_init --
+ * Initialize a handle so that using it with untimeout is benign.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that although an
+ * identification value is returned from timeout, the original
+ * arguments to timeout as well as the identifier are used to
+ * identify entries for untimeout.
+ */
+struct callout_handle
+timeout(ftn, arg, to_ticks)
+ timeout_t *ftn;
+ void *arg;
+ int to_ticks;
+{
+ struct callout_cpu *cc;
+ struct callout *new;
+ struct callout_handle handle;
+
+ cc = CC_CPU(timeout_cpu);
+ CC_LOCK(cc);
+ /* Fill in the next free callout structure. */
+ new = SLIST_FIRST(&cc->cc_callfree);
+ if (new == NULL)
+ /* XXX Attempt to malloc first */
+ panic("timeout table full");
+ SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
+ callout_reset(new, to_ticks, ftn, arg);
+ handle.callout = new;
+ CC_UNLOCK(cc);
+
+ return (handle);
+}
+
+void
+untimeout(ftn, arg, handle)
+ timeout_t *ftn;
+ void *arg;
+ struct callout_handle handle;
+{
+ struct callout_cpu *cc;
+
+ /*
+ * Check for a handle that was initialized
+ * by callout_handle_init, but never used
+ * for a real timeout.
+ */
+ if (handle.callout == NULL)
+ return;
+
+ cc = callout_lock(handle.callout);
+ if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
+ callout_stop(handle.callout);
+ CC_UNLOCK(cc);
+}
+
+void
+callout_handle_init(struct callout_handle *handle)
+{
+ handle->callout = NULL;
+}
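+
+/*
+ * Illustrative only (foo_tick and sc are made-up names): a legacy
+ * consumer arms a one-shot timeout that calls foo_tick(sc) after
+ * about one second and cancels it with the returned handle:
+ *
+ *	struct callout_handle h;
+ *
+ *	callout_handle_init(&h);
+ *	h = timeout(foo_tick, sc, hz);
+ *	...
+ *	untimeout(foo_tick, sc, h);
+ */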
+
+/*
+ * New interface; clients allocate their own callout structures.
+ *
+ * callout_reset() - establish or change a timeout
+ * callout_stop() - disestablish a timeout
+ * callout_init() - initialize a callout structure so that it can
+ * safely be passed to callout_reset() and callout_stop()
+ *
+ * <sys/callout.h> defines three convenience macros:
+ *
+ * callout_active() - returns truth if callout has not been stopped,
+ * drained, or deactivated since the last time the callout was
+ * reset.
+ * callout_pending() - returns truth if callout is still waiting for timeout
+ * callout_deactivate() - marks the callout as having been serviced
+ */
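+/*
+ * Illustrative only (foo_fn and sc are made-up names): a typical
+ * consumer initializes an MPSAFE callout, arms it to run foo_fn(sc)
+ * in about 100ms, and later cancels it:
+ *
+ *	struct callout co;
+ *
+ *	callout_init(&co, 1);
+ *	callout_reset(&co, hz / 10, foo_fn, sc);
+ *	...
+ *	callout_stop(&co);
+ */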
+int
+callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+ void (*ftn)(void *), void *arg, int cpu, int flags)
+{
+ sbintime_t to_sbt, pr;
+ struct callout_cpu *cc;
+ int cancelled, direct;
+
+ cancelled = 0;
+ if (flags & C_ABSOLUTE) {
+ to_sbt = sbt;
+ } else {
+ if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
+ sbt = tick_sbt;
+ if ((flags & C_HARDCLOCK) ||
+#ifdef NO_EVENTTIMERS
+ sbt >= sbt_timethreshold) {
+ to_sbt = getsbinuptime();
+
+ /* Add safety belt for the case of hz > 1000. */
+ to_sbt += tc_tick_sbt - tick_sbt;
+#else
+ sbt >= sbt_tickthreshold) {
+ /*
+			 * Obtain the time of the last hardclock() call on
+			 * this CPU directly from kern_clocksource.c.
+			 * The value is per-CPU, but it is the same on all
+			 * active CPUs.
+ */
+#ifdef __LP64__
+ to_sbt = DPCPU_GET(hardclocktime);
+#else
+ spinlock_enter();
+ to_sbt = DPCPU_GET(hardclocktime);
+ spinlock_exit();
+#endif
+#endif
+ if ((flags & C_HARDCLOCK) == 0)
+ to_sbt += tick_sbt;
+ } else
+ to_sbt = sbinuptime();
+ to_sbt += sbt;
+ pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+ sbt >> C_PRELGET(flags));
+ if (pr > precision)
+ precision = pr;
+ }
+ /*
+ * Don't allow migration of pre-allocated callouts lest they
+ * become unbalanced.
+ */
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ cpu = c->c_cpu;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ KASSERT(!direct || c->c_lock == NULL,
+ ("%s: direct callout %p has lock", __func__, c));
+ cc = callout_lock(c);
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * We're being asked to reschedule a callout which is
+ * currently in progress. If there is a lock then we
+ * can cancel the callout if it has not really started.
+ */
+ if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
+ cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
+ if (cc->cc_exec_entity[direct].cc_waiting) {
+ /*
+ * Someone has called callout_drain to kill this
+ * callout. Don't reschedule.
+ */
+ CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+ cancelled ? "cancelled" : "failed to cancel",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ }
+ if (c->c_flags & CALLOUT_PENDING) {
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ cancelled = 1;
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+ }
+
+#ifdef SMP
+ /*
+	 * If the callout must migrate, try to perform it immediately.
+ * If the callout is currently running, just defer the migration
+ * to a more appropriate moment.
+ */
+ if (c->c_cpu != cpu) {
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
+ cc->cc_exec_entity[direct].ce_migration_time
+ = to_sbt;
+ cc->cc_exec_entity[direct].ce_migration_prec
+ = precision;
+ cc->cc_exec_entity[direct].ce_migration_func = ftn;
+ cc->cc_exec_entity[direct].ce_migration_arg = arg;
+ c->c_flags |= CALLOUT_DFRMIGRATION;
+ CTR6(KTR_CALLOUT,
+ "migration of %p func %p arg %p in %d.%08x to %u deferred",
+ c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff), cpu);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ cc = callout_cpu_switch(c, cc, cpu);
+ }
+#endif
+
+ callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
+ CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
+ cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff));
+ CC_UNLOCK(cc);
+
+ return (cancelled);
+}
+
+/*
+ * Common idioms that can be optimized in the future.
+ */
+int
+callout_schedule_on(struct callout *c, int to_ticks, int cpu)
+{
+ return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
+}
+
+int
+callout_schedule(struct callout *c, int to_ticks)
+{
+ return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
+}
+
+int
+_callout_stop_safe(c, safe)
+ struct callout *c;
+ int safe;
+{
+ struct callout_cpu *cc, *old_cc;
+ struct lock_class *class;
+ int direct, sq_locked, use_lock;
+
+ /*
+ * Some old subsystems don't hold Giant while running a callout_stop(),
+ * so just discard this check for the moment.
+ */
+ if (!safe && c->c_lock != NULL) {
+ if (c->c_lock == &Giant.lock_object)
+ use_lock = mtx_owned(&Giant);
+ else {
+ use_lock = 1;
+ class = LOCK_CLASS(c->c_lock);
+ class->lc_assert(c->c_lock, LA_XLOCKED);
+ }
+ } else
+ use_lock = 0;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ sq_locked = 0;
+ old_cc = NULL;
+again:
+ cc = callout_lock(c);
+
+ /*
+ * If the callout was migrating while the callout cpu lock was
+ * dropped, just drop the sleepqueue lock and check the states
+ * again.
+ */
+ if (sq_locked != 0 && cc != old_cc) {
+#ifdef SMP
+ CC_UNLOCK(cc);
+ sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 0;
+ old_cc = NULL;
+ goto again;
+#else
+ panic("migration should not happen");
+#endif
+ }
+
+ /*
+ * If the callout isn't pending, it's not on the queue, so
+ * don't attempt to remove it from the queue. We can try to
+ * stop it by other means however.
+ */
+ if (!(c->c_flags & CALLOUT_PENDING)) {
+ c->c_flags &= ~CALLOUT_ACTIVE;
+
+ /*
+ * If it wasn't on the queue and it isn't the current
+ * callout, then we can't stop it, so just bail.
+ */
+ if (cc->cc_exec_entity[direct].cc_curr != c) {
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ if (sq_locked)
+ sleepq_release(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ return (0);
+ }
+
+ if (safe) {
+ /*
+ * The current callout is running (or just
+ * about to run) and blocking is allowed, so
+ * just wait for the current invocation to
+ * finish.
+ */
+ while (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * Use direct calls to sleepqueue interface
+ * instead of cv/msleep in order to avoid
+ * a LOR between cc_lock and sleepqueue
+				 * chain spinlocks. This piece of code
+				 * effectively emulates an msleep_spin() call.
+ *
+ * If we already have the sleepqueue chain
+ * locked, then we can safely block. If we
+ * don't already have it locked, however,
+ * we have to drop the cc_lock to lock
+ * it. This opens several races, so we
+ * restart at the beginning once we have
+ * both locks. If nothing has changed, then
+ * we will end up back here with sq_locked
+ * set.
+ */
+ if (!sq_locked) {
+ CC_UNLOCK(cc);
+ sleepq_lock(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 1;
+ old_cc = cc;
+ goto again;
+ }
+
+ /*
+				 * Migration could be cancelled here, but
+				 * since it is not clear when that will
+				 * actually happen, just let softclock()
+				 * take care of it.
+ */
+ cc->cc_exec_entity[direct].cc_waiting = true;
+ DROP_GIANT();
+ CC_UNLOCK(cc);
+ sleepq_add(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ &cc->cc_lock.lock_object, "codrain",
+ SLEEPQ_SLEEP, 0);
+ sleepq_wait(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ 0);
+ sq_locked = 0;
+ old_cc = NULL;
+
+ /* Reacquire locks previously released. */
+ PICKUP_GIANT();
+ CC_LOCK(cc);
+ }
+ } else if (use_lock &&
+ !cc->cc_exec_entity[direct].cc_cancel) {
+ /*
+			 * The current callout is waiting for its
+			 * lock, which we hold. Cancel the callout
+ * and return. After our caller drops the
+ * lock, the callout will be skipped in
+ * softclock().
+ */
+ cc->cc_exec_entity[direct].cc_cancel = true;
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ KASSERT(!cc_cce_migrating(cc, direct),
+ ("callout wrongly scheduled for migration"));
+ CC_UNLOCK(cc);
+ KASSERT(!sq_locked, ("sleepqueue chain locked"));
+ return (1);
+ } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (1);
+ }
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ KASSERT(!sq_locked, ("sleepqueue chain still locked"));
+ return (0);
+ }
+ if (sq_locked)
+ sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
+
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ callout_cc_del(c, cc);
+
+ CC_UNLOCK(cc);
+ return (1);
+}
+
+void
+callout_init(c, mpsafe)
+ struct callout *c;
+ int mpsafe;
+{
+ bzero(c, sizeof *c);
+ if (mpsafe) {
+ c->c_lock = NULL;
+ c->c_flags = CALLOUT_RETURNUNLOCKED;
+ } else {
+ c->c_lock = &Giant.lock_object;
+ c->c_flags = 0;
+ }
+ c->c_cpu = timeout_cpu;
+}
+
+void
+_callout_init_lock(c, lock, flags)
+ struct callout *c;
+ struct lock_object *lock;
+ int flags;
+{
+ bzero(c, sizeof *c);
+ c->c_lock = lock;
+ KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
+ ("callout_init_lock: bad flags %d", flags));
+ KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
+ ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
+ KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
+ (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
+ __func__));
+ c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
+ c->c_cpu = timeout_cpu;
+}
+
+#ifdef APM_FIXUP_CALLTODO
+/*
+ * Adjust the kernel calltodo timeout list. This routine is used after
+ * an APM resume to recalculate the calltodo timer list values with the
+ * number of hz's we have been sleeping. The next hardclock() will detect
+ * that there are fired timers and run softclock() to execute them.
+ *
+ * Please note, I have not done an exhaustive analysis of what code this
+ * might break. I am motivated to have my select()'s and alarm()'s that
+ * have expired during suspend firing upon resume so that the applications
+ * which set the timer can do the maintenance the timer was for as close
+ * as possible to the originally intended time. Testing this code for a
+ * week showed that resuming from a suspend resulted in 22 to 25 timers
+ * firing, which seemed independent of whether the suspend was 2 hours or
+ * 2 days. Your mileage may vary. - Ken Key <key@cs.utk.edu>
+ */
+void
+adjust_timeout_calltodo(time_change)
+ struct timeval *time_change;
+{
+ register struct callout *p;
+ unsigned long delta_ticks;
+
+ /*
+ * How many ticks were we asleep?
+ * (stolen from tvtohz()).
+ */
+
+ /* Don't do anything */
+ if (time_change->tv_sec < 0)
+ return;
+ else if (time_change->tv_sec <= LONG_MAX / 1000000)
+ delta_ticks = (time_change->tv_sec * 1000000 +
+ time_change->tv_usec + (tick - 1)) / tick + 1;
+ else if (time_change->tv_sec <= LONG_MAX / hz)
+ delta_ticks = time_change->tv_sec * hz +
+ (time_change->tv_usec + (tick - 1)) / tick + 1;
+ else
+ delta_ticks = LONG_MAX;
+
+ if (delta_ticks > INT_MAX)
+ delta_ticks = INT_MAX;
+
+ /*
+ * Now rip through the timer calltodo list looking for timers
+ * to expire.
+ */
+
+ /* don't collide with softclock() */
+ CC_LOCK(cc);
+ for (p = calltodo.c_next; p != NULL; p = p->c_next) {
+ p->c_time -= delta_ticks;
+
+ /* Break if the timer had more time on it than delta_ticks */
+ if (p->c_time > 0)
+ break;
+
+ /* take back the ticks the timer didn't use (p->c_time <= 0) */
+ delta_ticks = -p->c_time;
+ }
+ CC_UNLOCK(cc);
+
+ return;
+}
+#endif /* APM_FIXUP_CALLTODO */
+
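+/*
+ * Find the index of the most significant set bit of an sbintime_t,
+ * after scaling the value by 1.5; used to build the logarithmic
+ * histograms in the sysctl handler below.
+ */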
+static int
+flssbt(sbintime_t sbt)
+{
+
+ sbt += (uint64_t)sbt >> 1;
+ if (sizeof(long) >= sizeof(sbintime_t))
+ return (flsl(sbt));
+ if (sbt >= SBT_1S)
+ return (flsl(((uint64_t)sbt) >> 32) + 32);
+ return (flsl(sbt));
+}
+
+/*
+ * Dump an immediate statistics snapshot of the scheduled callouts.
+ */
+static int
+sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
+{
+ struct callout *tmp;
+ struct callout_cpu *cc;
+ struct callout_list *sc;
+ sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
+ int ct[64], cpr[64], ccpbk[32];
+ int error, val, i, count, tcum, pcum, maxc, c, medc;
+#ifdef SMP
+ int cpu;
+#endif
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ count = maxc = 0;
+ st = spr = maxt = maxpr = 0;
+ bzero(ccpbk, sizeof(ccpbk));
+ bzero(ct, sizeof(ct));
+ bzero(cpr, sizeof(cpr));
+ now = sbinuptime();
+#ifdef SMP
+ CPU_FOREACH(cpu) {
+ cc = CC_CPU(cpu);
+#else
+ cc = CC_CPU(timeout_cpu);
+#endif
+ CC_LOCK(cc);
+ for (i = 0; i < callwheelsize; i++) {
+ sc = &cc->cc_callwheel[i];
+ c = 0;
+ LIST_FOREACH(tmp, sc, c_links.le) {
+ c++;
+ t = tmp->c_time - now;
+ if (t < 0)
+ t = 0;
+ st += t / SBT_1US;
+ spr += tmp->c_precision / SBT_1US;
+ if (t > maxt)
+ maxt = t;
+ if (tmp->c_precision > maxpr)
+ maxpr = tmp->c_precision;
+ ct[flssbt(t)]++;
+ cpr[flssbt(tmp->c_precision)]++;
+ }
+ if (c > maxc)
+ maxc = c;
+ ccpbk[fls(c + c / 2)]++;
+ count += c;
+ }
+ CC_UNLOCK(cc);
+#ifdef SMP
+ }
+#endif
+
+ for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
+ tcum += ct[i];
+ medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+ for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
+ pcum += cpr[i];
+ medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+ for (i = 0, c = 0; i < 32 && c < count / 2; i++)
+ c += ccpbk[i];
+ medc = (i >= 2) ? (1 << (i - 2)) : 0;
+
+ printf("Scheduled callouts statistic snapshot:\n");
+ printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n",
+ count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
+ printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n",
+ medc,
+ count / callwheelsize / mp_ncpus,
+ (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
+ maxc);
+ printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+ medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
+ (st / count) / 1000000, (st / count) % 1000000,
+ maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
+ printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+ medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
+ (spr / count) / 1000000, (spr / count) % 1000000,
+ maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
+ printf(" Distribution: \tbuckets\t time\t tcum\t"
+ " prec\t pcum\n");
+ for (i = 0, tcum = pcum = 0; i < 64; i++) {
+ if (ct[i] == 0 && cpr[i] == 0)
+ continue;
+ t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
+ tcum += ct[i];
+ pcum += cpr[i];
+ printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
+ t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
+ i - 1 - (32 - CC_HASH_SHIFT),
+ ct[i], tcum, cpr[i], pcum);
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_callout_stat, "I",
+ "Dump immediate statistic snapshot of the scheduled callouts");
diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c
new file mode 100644
index 0000000..0e21383
--- /dev/null
+++ b/sys/kern/kern_umtx.c
@@ -0,0 +1,3918 @@
+/*-
+ * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
+ * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_umtx_profiling.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/eventhandler.h>
+#include <sys/umtx.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+
+#include <machine/cpu.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32_proto.h>
+#endif
+
+#define _UMUTEX_TRY 1
+#define _UMUTEX_WAIT 2
+
+#ifdef UMTX_PROFILING
+#define UPROF_PERC_BIGGER(w, f, sw, sf) \
+ (((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
+#endif
+
+/* Priority inheritance mutex info. */
+struct umtx_pi {
+ /* Owner thread */
+ struct thread *pi_owner;
+
+ /* Reference count */
+ int pi_refcount;
+
+	/* List entry to link umtx_pi structures held by a thread */
+ TAILQ_ENTRY(umtx_pi) pi_link;
+
+ /* List entry in hash */
+ TAILQ_ENTRY(umtx_pi) pi_hashlink;
+
+ /* List for waiters */
+ TAILQ_HEAD(,umtx_q) pi_blocked;
+
+ /* Identify a userland lock object */
+ struct umtx_key pi_key;
+};
+
+/* A userland synchronization object user. */
+struct umtx_q {
+ /* Linked list for the hash. */
+ TAILQ_ENTRY(umtx_q) uq_link;
+
+ /* Umtx key. */
+ struct umtx_key uq_key;
+
+ /* Umtx flags. */
+ int uq_flags;
+#define UQF_UMTXQ 0x0001
+
+	/* The thread that is waiting. */
+ struct thread *uq_thread;
+
+ /*
+	 * The PI mutex this thread is blocked on. Reads can use the
+	 * chain lock or umtx_lock; writes must hold both the chain
+	 * lock and umtx_lock.
+ */
+ struct umtx_pi *uq_pi_blocked;
+
+ /* On blocked list */
+ TAILQ_ENTRY(umtx_q) uq_lockq;
+
+ /* Thread contending with us */
+ TAILQ_HEAD(,umtx_pi) uq_pi_contested;
+
+ /* Inherited priority from PP mutex */
+ u_char uq_inherited_pri;
+
+ /* Spare queue ready to be reused */
+ struct umtxq_queue *uq_spare_queue;
+
+	/* The queue we are on */
+ struct umtxq_queue *uq_cur_queue;
+};
+
+TAILQ_HEAD(umtxq_head, umtx_q);
+
+/* Per-key wait-queue */
+struct umtxq_queue {
+ struct umtxq_head head;
+ struct umtx_key key;
+ LIST_ENTRY(umtxq_queue) link;
+ int length;
+};
+
+LIST_HEAD(umtxq_list, umtxq_queue);
+
+/* Userland lock object's wait-queue chain */
+struct umtxq_chain {
+ /* Lock for this chain. */
+ struct mtx uc_lock;
+
+ /* List of sleep queues. */
+ struct umtxq_list uc_queue[2];
+#define UMTX_SHARED_QUEUE 0
+#define UMTX_EXCLUSIVE_QUEUE 1
+
+ LIST_HEAD(, umtxq_queue) uc_spare_queue;
+
+ /* Busy flag */
+ char uc_busy;
+
+ /* Chain lock waiters */
+ int uc_waiters;
+
+	/* All PIs in the list */
+ TAILQ_HEAD(,umtx_pi) uc_pi_list;
+
+#ifdef UMTX_PROFILING
+ u_int length;
+ u_int max_length;
+#endif
+};
+
+#define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED)
+#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
+
+/*
+ * Don't propagate time-sharing priority; there is a security reason.
+ * A user could simply create a PI mutex, let thread A lock the mutex,
+ * and let another thread B block on the mutex. Because B is sleeping,
+ * its priority would be boosted, which would boost A's priority via
+ * priority propagation as well and never lower it again, even if A
+ * were using 100% CPU. That would be unfair to other processes.
+ */
+
+#define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
+ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
+ PRI_MAX_TIMESHARE : (td)->td_user_pri)
+
+#define GOLDEN_RATIO_PRIME 2654404609U
+#define UMTX_CHAINS 512
+#define UMTX_SHIFTS (__WORD_BIT - 9)
+
+#define GET_SHARE(flags) \
+ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
+
+#define BUSY_SPINS 200
+
+struct abs_timeout {
+ int clockid;
+ struct timespec cur;
+ struct timespec end;
+};
+
+static uma_zone_t umtx_pi_zone;
+static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS];
+static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
+static int umtx_pi_allocated;
+
+static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
+SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
+ &umtx_pi_allocated, 0, "Allocated umtx_pi");
+
+#ifdef UMTX_PROFILING
+static long max_length;
+SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
+static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
+#endif
+
+static void umtxq_sysinit(void *);
+static void umtxq_hash(struct umtx_key *key);
+static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
+static void umtxq_lock(struct umtx_key *key);
+static void umtxq_unlock(struct umtx_key *key);
+static void umtxq_busy(struct umtx_key *key);
+static void umtxq_unbusy(struct umtx_key *key);
+static void umtxq_insert_queue(struct umtx_q *uq, int q);
+static void umtxq_remove_queue(struct umtx_q *uq, int q);
+static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
+static int umtxq_count(struct umtx_key *key);
+static struct umtx_pi *umtx_pi_alloc(int);
+static void umtx_pi_free(struct umtx_pi *pi);
+static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
+static void umtx_thread_cleanup(struct thread *td);
+static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused);
+SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
+
+#define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
+#define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
+#define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
+
+static struct mtx umtx_lock;
+
+#ifdef UMTX_PROFILING
+static void
+umtx_init_profiling(void)
+{
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+ int i;
+
+ for (i = 0; i < UMTX_CHAINS; ++i) {
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
+ }
+}
+
+static int
+sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
+{
+ char buf[512];
+ struct sbuf sb;
+ struct umtxq_chain *uc;
+ u_int fract, i, j, tot, whole;
+ u_int sf0, sf1, sf2, sf3, sf4;
+ u_int si0, si1, si2, si3, si4;
+ u_int sw0, sw1, sw2, sw3, sw4;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ for (i = 0; i < 2; i++) {
+ tot = 0;
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ tot += uc->max_length;
+ mtx_unlock(&uc->uc_lock);
+ }
+ if (tot == 0)
+ sbuf_printf(&sb, "%u) Empty ", i);
+ else {
+ sf0 = sf1 = sf2 = sf3 = sf4 = 0;
+ si0 = si1 = si2 = si3 = si4 = 0;
+ sw0 = sw1 = sw2 = sw3 = sw4 = 0;
+ for (j = 0; j < UMTX_CHAINS; j++) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ whole = uc->max_length * 100;
+ mtx_unlock(&uc->uc_lock);
+ fract = (whole % tot) * 100;
+ if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
+ sf0 = fract;
+ si0 = j;
+ sw0 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw1,
+ sf1)) {
+ sf1 = fract;
+ si1 = j;
+ sw1 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw2,
+ sf2)) {
+ sf2 = fract;
+ si2 = j;
+ sw2 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw3,
+ sf3)) {
+ sf3 = fract;
+ si3 = j;
+ sw3 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw4,
+ sf4)) {
+ sf4 = fract;
+ si4 = j;
+ sw4 = whole;
+ }
+ }
+ sbuf_printf(&sb, "queue %u:\n", i);
+ sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
+ sf0 / tot, si0);
+ sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
+ sf1 / tot, si1);
+ sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
+ sf2 / tot, si2);
+ sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
+ sf3 / tot, si3);
+ sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
+ sf4 / tot, si4);
+ }
+ }
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+ sbuf_delete(&sb);
+ return (0);
+}
+
+static int
+sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
+{
+ struct umtxq_chain *uc;
+ u_int i, j;
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (clear != 0) {
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ uc->length = 0;
+ uc->max_length = 0;
+ mtx_unlock(&uc->uc_lock);
+ }
+ }
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
+#endif
+
+static void
+umtxq_sysinit(void *arg __unused)
+{
+ int i, j;
+
+ umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
+ MTX_DEF | MTX_DUPOK);
+ LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
+ LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
+ LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
+ TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
+ umtxq_chains[i][j].uc_busy = 0;
+ umtxq_chains[i][j].uc_waiters = 0;
+#ifdef UMTX_PROFILING
+ umtxq_chains[i][j].length = 0;
+ umtxq_chains[i][j].max_length = 0;
+#endif
+ }
+ }
+#ifdef UMTX_PROFILING
+ umtx_init_profiling();
+#endif
+ mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
+ EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+struct umtx_q *
+umtxq_alloc(void)
+{
+ struct umtx_q *uq;
+
+ uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
+ uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&uq->uq_spare_queue->head);
+ TAILQ_INIT(&uq->uq_pi_contested);
+ uq->uq_inherited_pri = PRI_MAX;
+ return (uq);
+}
+
+void
+umtxq_free(struct umtx_q *uq)
+{
+ MPASS(uq->uq_spare_queue != NULL);
+ free(uq->uq_spare_queue, M_UMTX);
+ free(uq, M_UMTX);
+}
+
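+/*
+ * Hash a umtx key into one of UMTX_CHAINS buckets using a
+ * multiplicative (golden-ratio) hash of the key's address information.
+ */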
+static inline void
+umtxq_hash(struct umtx_key *key)
+{
+ unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
+ key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
+}
+
+static inline struct umtxq_chain *
+umtxq_getchain(struct umtx_key *key)
+{
+ if (key->type <= TYPE_SEM)
+ return (&umtxq_chains[1][key->hash]);
+ return (&umtxq_chains[0][key->hash]);
+}
+
+/*
+ * Lock a chain.
+ */
+static inline void
+umtxq_lock(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_lock(&uc->uc_lock);
+}
+
+/*
+ * Unlock a chain.
+ */
+static inline void
+umtxq_unlock(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_unlock(&uc->uc_lock);
+}
+
+/*
+ * Set the chain to the busy state when the following operation
+ * may block (so a kernel mutex cannot be used).
+ */
+static inline void
+umtxq_busy(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ if (uc->uc_busy) {
+#ifdef SMP
+ if (smp_cpus > 1) {
+ int count = BUSY_SPINS;
+ if (count > 0) {
+ umtxq_unlock(key);
+ while (uc->uc_busy && --count > 0)
+ cpu_spinwait();
+ umtxq_lock(key);
+ }
+ }
+#endif
+ while (uc->uc_busy) {
+ uc->uc_waiters++;
+ msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
+ uc->uc_waiters--;
+ }
+ }
+ uc->uc_busy = 1;
+}
+
+/*
+ * Unbusy a chain.
+ */
+static inline void
+umtxq_unbusy(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ KASSERT(uc->uc_busy != 0, ("not busy"));
+ uc->uc_busy = 0;
+ if (uc->uc_waiters)
+ wakeup_one(uc);
+}
+
+static struct umtxq_queue *
+umtxq_queue_lookup(struct umtx_key *key, int q)
+{
+ struct umtxq_queue *uh;
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ LIST_FOREACH(uh, &uc->uc_queue[q], link) {
+ if (umtx_key_match(&uh->key, key))
+ return (uh);
+ }
+
+ return (NULL);
+}
+
+static inline void
+umtxq_insert_queue(struct umtx_q *uq, int q)
+{
+ struct umtxq_queue *uh;
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
+ uh = umtxq_queue_lookup(&uq->uq_key, q);
+ if (uh != NULL) {
+ LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
+ } else {
+ uh = uq->uq_spare_queue;
+ uh->key = uq->uq_key;
+ LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
+#ifdef UMTX_PROFILING
+ uc->length++;
+ if (uc->length > uc->max_length) {
+ uc->max_length = uc->length;
+ if (uc->max_length > max_length)
+ max_length = uc->max_length;
+ }
+#endif
+ }
+ uq->uq_spare_queue = NULL;
+
+ TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
+ uh->length++;
+ uq->uq_flags |= UQF_UMTXQ;
+ uq->uq_cur_queue = uh;
+ return;
+}
+
+static inline void
+umtxq_remove_queue(struct umtx_q *uq, int q)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ if (uq->uq_flags & UQF_UMTXQ) {
+ uh = uq->uq_cur_queue;
+ TAILQ_REMOVE(&uh->head, uq, uq_link);
+ uh->length--;
+ uq->uq_flags &= ~UQF_UMTXQ;
+ if (TAILQ_EMPTY(&uh->head)) {
+ KASSERT(uh->length == 0,
+ ("inconsistent umtxq_queue length"));
+#ifdef UMTX_PROFILING
+ uc->length--;
+#endif
+ LIST_REMOVE(uh, link);
+ } else {
+ uh = LIST_FIRST(&uc->uc_spare_queue);
+ KASSERT(uh != NULL, ("uc_spare_queue is empty"));
+ LIST_REMOVE(uh, link);
+ }
+ uq->uq_spare_queue = uh;
+ uq->uq_cur_queue = NULL;
+ }
+}
+
+/*
+ * Return the number of threads waiting on the key's shared queue.
+ */
+static int
+umtxq_count(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
+ if (uh != NULL)
+ return (uh->length);
+ return (0);
+}
+
+/*
+ * Return the number of PI waiters on the key and, via *first, the
+ * first of them.
+ */
+static int
+umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ *first = NULL;
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
+ if (uh != NULL) {
+ *first = TAILQ_FIRST(&uh->head);
+ return (uh->length);
+ }
+ return (0);
+}
+
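+/*
+ * Check whether the thread has a pending suspension or single-threading
+ * request and translate it into EINTR or ERESTART so callers' retry
+ * loops can break out instead of looping forever.
+ */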
+static int
+umtxq_check_susp(struct thread *td)
+{
+ struct proc *p;
+ int error;
+
+ /*
+ * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
+ * eventually break the lockstep loop.
+ */
+ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
+ return (0);
+ error = 0;
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (P_SHOULDSTOP(p) ||
+ ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
+ if (p->p_flag & P_SINGLE_EXIT)
+ error = EINTR;
+ else
+ error = ERESTART;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Wake up threads waiting on a userland object.
+ */
+
+static int
+umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+ struct umtx_q *uq;
+ int ret;
+
+ ret = 0;
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, q);
+ if (uh != NULL) {
+ while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
+ umtxq_remove_queue(uq, q);
+ wakeup(uq);
+ if (++ret >= n_wake)
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+
+/*
+ * Wake up the specified thread.
+ */
+static inline void
+umtxq_signal_thread(struct umtx_q *uq)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ umtxq_remove(uq);
+ wakeup(uq);
+}
+
+static inline int
+tstohz(const struct timespec *tsp)
+{
+ struct timeval tv;
+
+ TIMESPEC_TO_TIMEVAL(&tv, tsp);
+ return tvtohz(&tv);
+}
+
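+/*
+ * Helpers for umtx timeouts: record the absolute end time in the
+ * requested clock and convert the time remaining into a tick count
+ * suitable for msleep().
+ */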
+static void
+abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
+ const struct timespec *timeout)
+{
+
+ timo->clockid = clockid;
+ if (!absolute) {
+ kern_clock_gettime(curthread, clockid, &timo->end);
+ timo->cur = timo->end;
+ timespecadd(&timo->end, timeout);
+ } else {
+ timo->end = *timeout;
+ kern_clock_gettime(curthread, clockid, &timo->cur);
+ }
+}
+
+static void
+abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
+{
+
+ abs_timeout_init(timo, umtxtime->_clockid,
+ (umtxtime->_flags & UMTX_ABSTIME) != 0,
+ &umtxtime->_timeout);
+}
+
+static inline void
+abs_timeout_update(struct abs_timeout *timo)
+{
+ kern_clock_gettime(curthread, timo->clockid, &timo->cur);
+}
+
+static int
+abs_timeout_gethz(struct abs_timeout *timo)
+{
+ struct timespec tts;
+
+ if (timespeccmp(&timo->end, &timo->cur, <=))
+ return (-1);
+ tts = timo->end;
+ timespecsub(&tts, &timo->cur);
+ return (tstohz(&tts));
+}
+
+/*
+ * Put the thread into a sleep state; before sleeping, check whether
+ * the thread has been removed from the umtx queue.
+ */
+static inline int
+umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
+{
+ struct umtxq_chain *uc;
+ int error, timo;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ for (;;) {
+ if (!(uq->uq_flags & UQF_UMTXQ))
+ return (0);
+ if (abstime != NULL) {
+ timo = abs_timeout_gethz(abstime);
+ if (timo < 0)
+ return (ETIMEDOUT);
+ } else
+ timo = 0;
+ error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
+ if (error != EWOULDBLOCK) {
+ umtxq_lock(&uq->uq_key);
+ break;
+ }
+ if (abstime != NULL)
+ abs_timeout_update(abstime);
+ umtxq_lock(&uq->uq_key);
+ }
+ return (error);
+}
+
+/*
+ * Convert a userspace address into a unique logical address.
+ */
+int
+umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
+{
+ struct thread *td = curthread;
+ vm_map_t map;
+ vm_map_entry_t entry;
+ vm_pindex_t pindex;
+ vm_prot_t prot;
+ boolean_t wired;
+
+ key->type = type;
+ if (share == THREAD_SHARE) {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ } else {
+ MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
+ map = &td->td_proc->p_vmspace->vm_map;
+ if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
+ &entry, &key->info.shared.object, &pindex, &prot,
+ &wired) != KERN_SUCCESS) {
+ return EFAULT;
+ }
+
+ if ((share == PROCESS_SHARE) ||
+ (share == AUTO_SHARE &&
+ VM_INHERIT_SHARE == entry->inheritance)) {
+ key->shared = 1;
+ key->info.shared.offset = entry->offset + entry->start -
+ (vm_offset_t)addr;
+ vm_object_reference(key->info.shared.object);
+ } else {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ }
+ vm_map_lookup_done(map, entry);
+ }
+
+ umtxq_hash(key);
+ return (0);
+}
+
+/*
+ * Release key.
+ */
+void
+umtx_key_release(struct umtx_key *key)
+{
+ if (key->shared)
+ vm_object_deallocate(key->info.shared.object);
+}
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
+ const struct timespec *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ u_long owner;
+ u_long old;
+ int error = 0;
+
+ uq = td->td_umtxq;
+ if (timeout != NULL)
+ abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMTX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMTX_CONTESTED) {
+ owner = casuword(&umtx->u_owner,
+ UMTX_CONTESTED, id | UMTX_CONTESTED);
+
+ if (owner == UMTX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+		 * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+		 * If we set the contested bit, sleep. Otherwise the lock
+		 * changed and we need to retry, or we lost a race to the
+		 * thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
+ &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ if (timeout == NULL) {
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
+{
+ struct umtx_key key;
+ u_long owner;
+ u_long old;
+ int error;
+ int count;
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMTX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMTX_CONTESTED) == 0) {
+ old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+	 * zero or only one thread is waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword(&umtx->u_owner, owner,
+ count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
+ const struct timespec *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t owner;
+ uint32_t old;
+ int error = 0;
+
+ uq = td->td_umtxq;
+
+ if (timeout != NULL)
+ abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(m, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(m,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+		 * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+		 * If we set the contested bit, sleep. Otherwise the lock
+		 * changed and we need to retry, or we lost a race to the
+		 * thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ if (timeout == NULL) {
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
+{
+ struct umtx_key key;
+ uint32_t owner;
+ uint32_t old;
+ int error;
+ int count;
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(m);
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(m, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+	 * zero or only one thread is waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword32(m, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+#endif
+
+/*
+ * Fetch and compare a value; sleep on the address if it has not changed.
+ */
+static int
+do_wait(struct thread *td, void *addr, u_long id,
+ struct _umtx_time *timeout, int compat32, int is_private)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ u_long tmp;
+ int error = 0;
+
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
+ is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+ if (compat32 == 0)
+ tmp = fuword(addr);
+ else
+ tmp = (unsigned int)fuword32(addr);
+ umtxq_lock(&uq->uq_key);
+ if (tmp == id)
+ error = umtxq_sleep(uq, "uwait", timeout == NULL ?
+ NULL : &timo);
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+/*
+ * Wake up threads sleeping on the specified address.
+ */
+int
+kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
+{
+ struct umtx_key key;
+ int ret;
+
+ if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
+ is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
+ return (ret);
+ umtxq_lock(&key);
+ ret = umtxq_signal(&key, n_wake);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (0);
+}
+
+/*
+ * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int mode)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t owner, old, id;
+ int error = 0;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
+ if (mode == _UMUTEX_WAIT) {
+ if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
+ return (0);
+ } else {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ return (error);
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id)
+ return (EDEADLK);
+
+ if (mode == _UMUTEX_TRY)
+ return (EBUSY);
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
+ GET_SHARE(flags), &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails,
+ * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ return (0);
+}
+
+/*
+ * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ uint32_t owner, old, id;
+ int error;
+ int count;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is at most one thread waiting for it; otherwise it
+ * must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Check if the mutex is available and wake up a waiter;
+ * this is only for a simple mutex.
+ */
+static int
+do_wake_umutex(struct thread *td, struct umutex *m)
+{
+ struct umtx_key key;
+ uint32_t owner;
+ uint32_t flags;
+ int error;
+ int count;
+
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != 0)
+ return (0);
+
+ flags = fuword32(&m->m_flags);
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
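+ /*
+ * If at most one thread is waiting, try to clear the
+ * contested bit so that userland can use the fast path again.
+ */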
+ if (count <= 1)
+ owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
+
+ umtxq_lock(&key);
+ if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (0);
+}
+
+/*
+ * Check if the mutex has waiters and try to fix the contention bit.
+ */
+static int
+do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ uint32_t owner, old;
+ int type;
+ int error;
+ int count;
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ type = TYPE_NORMAL_UMUTEX;
+ break;
+ case UMUTEX_PRIO_INHERIT:
+ type = TYPE_PI_UMUTEX;
+ break;
+ case UMUTEX_PRIO_PROTECT:
+ type = TYPE_PP_UMUTEX;
+ break;
+ default:
+ return (EINVAL);
+ }
+ if ((error = umtx_key_get(m, type, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ owner = 0;
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+ /*
+ * Only repair the contention bit if there is a waiter; this means
+ * the mutex is still being referenced by userland code. Otherwise,
+ * do not update any memory.
+ */
+ if (count > 1) {
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ while ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner,
+ owner | UMUTEX_CONTESTED);
+ if (old == owner)
+ break;
+ owner = old;
+ if (old == -1)
+ break;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ } else if (count == 1) {
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ while ((owner & ~UMUTEX_CONTESTED) != 0 &&
+ (owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner,
+ owner | UMUTEX_CONTESTED);
+ if (old == owner)
+ break;
+ owner = old;
+ if (old == -1)
+ break;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ }
+ umtxq_lock(&key);
+ if (owner == -1) {
+ error = EFAULT;
+ umtxq_signal(&key, INT_MAX);
+ }
+ else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+static inline struct umtx_pi *
+umtx_pi_alloc(int flags)
+{
+ struct umtx_pi *pi;
+
+ pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
+ TAILQ_INIT(&pi->pi_blocked);
+ atomic_add_int(&umtx_pi_allocated, 1);
+ return (pi);
+}
+
+static inline void
+umtx_pi_free(struct umtx_pi *pi)
+{
+ uma_zfree(umtx_pi_zone, pi);
+ atomic_add_int(&umtx_pi_allocated, -1);
+}
+
+/*
+ * Adjust the thread's position on a pi_state after its priority has been
+ * changed.
+ */
+static int
+umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
+{
+ struct umtx_q *uq, *uq1, *uq2;
+ struct thread *td1;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi == NULL)
+ return (0);
+
+ uq = td->td_umtxq;
+
+ /*
+ * Check if the thread needs to be moved on the blocked chain.
+ * It needs to be moved if either its priority is lower than
+ * the previous thread or higher than the next thread.
+ */
+ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
+ uq2 = TAILQ_NEXT(uq, uq_lockq);
+ if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
+ (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved to.
+ */
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ td1 = uq1->uq_thread;
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (UPRI(td1) > UPRI(td))
+ break;
+ }
+
+ if (uq1 == NULL)
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+ else
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ }
+ return (1);
+}
+
+/*
+ * Propagate priority when a thread is blocked on POSIX
+ * PI mutex.
+ */
+static void
+umtx_propagate_priority(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+ int pri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ pri = UPRI(td);
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ if (pi == NULL)
+ return;
+
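+ /*
+ * Walk the chain of PI mutex owners, lending this thread's
+ * priority to each owner whose lent priority is lower; stop
+ * when an owner already has an equal or higher priority, or
+ * is not blocked on another PI mutex.
+ */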
+ for (;;) {
+ td = pi->pi_owner;
+ if (td == NULL || td == curthread)
+ return;
+
+ MPASS(td->td_proc != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+
+ thread_lock(td);
+ if (td->td_lend_user_pri > pri)
+ sched_lend_user_prio(td, pri);
+ else {
+ thread_unlock(td);
+ break;
+ }
+ thread_unlock(td);
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ if (pi == NULL)
+ break;
+ /* Resort td on the list if needed. */
+ umtx_pi_adjust_thread(pi, td);
+ }
+}
+
+/*
+ * Unpropagate priority for a PI mutex when a thread blocked on
+ * it is interrupted by a signal or resumed by others.
+ */
+static void
+umtx_repropagate_priority(struct umtx_pi *pi)
+{
+ struct umtx_q *uq, *uq_owner;
+ struct umtx_pi *pi2;
+ int pri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+
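+ /*
+ * For each owner along the blocked chain, recompute the
+ * priority to lend as the highest priority among the threads
+ * blocked on any of its contested PI mutexes, and by its own
+ * inherited priority.
+ */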
+ while (pi != NULL && pi->pi_owner != NULL) {
+ pri = PRI_MAX;
+ uq_owner = pi->pi_owner->td_umtxq;
+
+ TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
+ uq = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq != NULL) {
+ if (pri > UPRI(uq->uq_thread))
+ pri = UPRI(uq->uq_thread);
+ }
+ }
+
+ if (pri > uq_owner->uq_inherited_pri)
+ pri = uq_owner->uq_inherited_pri;
+ thread_lock(pi->pi_owner);
+ sched_lend_user_prio(pi->pi_owner, pri);
+ thread_unlock(pi->pi_owner);
+ if ((pi = uq_owner->uq_pi_blocked) != NULL)
+ umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
+ }
+}
+
+/*
+ * Insert a PI mutex into owned list.
+ */
+static void
+umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi->pi_owner != NULL)
+ panic("pi_ower != NULL");
+ pi->pi_owner = owner;
+ TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
+}
+
+/*
+ * Claim ownership of a PI mutex.
+ */
+static int
+umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq, *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner == owner) {
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+ }
+
+ if (pi->pi_owner != NULL) {
+ /*
+ * userland may have already messed up the mutex, sigh.
+ */
+ mtx_unlock_spin(&umtx_lock);
+ return (EPERM);
+ }
+ umtx_pi_setowner(pi, owner);
+ uq = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq != NULL) {
+ int pri;
+
+ pri = UPRI(uq->uq_thread);
+ thread_lock(owner);
+ if (pri < UPRI(owner))
+ sched_lend_user_prio(owner, pri);
+ thread_unlock(owner);
+ }
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+}
+
+/*
+ * Adjust a thread's position in the blocked list of its PI mutex;
+ * this may trigger a new round of priority propagation.
+ */
+void
+umtx_pi_adjust(struct thread *td, u_char oldpri)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ uq = td->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ pi = uq->uq_pi_blocked;
+ if (pi != NULL) {
+ umtx_pi_adjust_thread(pi, td);
+ umtx_repropagate_priority(pi);
+ }
+ mtx_unlock_spin(&umtx_lock);
+}
+
+/*
+ * Sleep on a PI mutex.
+ */
+static int
+umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
+ uint32_t owner, const char *wmesg, struct abs_timeout *timo)
+{
+ struct umtxq_chain *uc;
+ struct thread *td, *td1;
+ struct umtx_q *uq1;
+ int pri;
+ int error = 0;
+
+ td = uq->uq_thread;
+ KASSERT(td == curthread, ("inconsistent uq_thread"));
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ UMTXQ_BUSY_ASSERT(uc);
+ umtxq_insert(uq);
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner == NULL) {
+ mtx_unlock_spin(&umtx_lock);
+ /* XXX Only look up thread in current process. */
+ td1 = tdfind(owner, curproc->p_pid);
+ mtx_lock_spin(&umtx_lock);
+ if (td1 != NULL) {
+ if (pi->pi_owner == NULL)
+ umtx_pi_setowner(pi, td1);
+ PROC_UNLOCK(td1->td_proc);
+ }
+ }
+
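+ /*
+ * Insert the thread into the PI blocked list in priority
+ * order, so the highest-priority waiter is at the head.
+ */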
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ pri = UPRI(uq1->uq_thread);
+ if (pri > UPRI(td))
+ break;
+ }
+
+ if (uq1 != NULL)
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ else
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+
+ uq->uq_pi_blocked = pi;
+ thread_lock(td);
+ td->td_flags |= TDF_UPIBLOCKED;
+ thread_unlock(td);
+ umtx_propagate_priority(td);
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, wmesg, timo);
+ umtxq_remove(uq);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_pi_blocked = NULL;
+ thread_lock(td);
+ td->td_flags &= ~TDF_UPIBLOCKED;
+ thread_unlock(td);
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ umtx_repropagate_priority(pi);
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unlock(&uq->uq_key);
+
+ return (error);
+}
+
+/*
+ * Increase the reference count of a PI mutex.
+ */
+static void
+umtx_pi_ref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ pi->pi_refcount++;
+}
+
+/*
+ * Decrease the reference count of a PI mutex; if the count
+ * drops to zero, its memory is freed.
+ */
+static void
+umtx_pi_unref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
+ if (--pi->pi_refcount == 0) {
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner != NULL) {
+ TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
+ pi, pi_link);
+ pi->pi_owner = NULL;
+ }
+ KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
+ ("blocked queue not empty"));
+ mtx_unlock_spin(&umtx_lock);
+ TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
+ umtx_pi_free(pi);
+ }
+}
+
+/*
+ * Find a PI mutex in hash table.
+ */
+static struct umtx_pi *
+umtx_pi_lookup(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+ struct umtx_pi *pi;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+
+ TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
+ if (umtx_key_match(&pi->pi_key, key)) {
+ return (pi);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Insert a PI mutex into hash table.
+ */
+static inline void
+umtx_pi_insert(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
+}
+
+/*
+ * Lock a PI mutex.
+ */
+static int
+do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int try)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ struct umtx_pi *pi, *new_pi;
+ uint32_t id, owner, old;
+ int error;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
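+ /*
+ * If there is no PI record for this key, allocate one. Try a
+ * non-sleeping allocation first; if that fails, drop the queue
+ * lock, allocate with M_WAITOK and re-check whether a racing
+ * thread installed a record in the meantime.
+ */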
+ if (pi == NULL) {
+ new_pi = umtx_pi_alloc(M_NOWAIT);
+ if (new_pi == NULL) {
+ umtxq_unlock(&uq->uq_key);
+ new_pi = umtx_pi_alloc(M_WAITOK);
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
+ if (pi != NULL) {
+ umtx_pi_free(new_pi);
+ new_pi = NULL;
+ }
+ }
+ if (new_pi != NULL) {
+ new_pi->pi_key = uq->uq_key;
+ umtx_pi_insert(new_pi);
+ pi = new_pi;
+ }
+ }
+ umtx_pi_ref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Care must be exercised when dealing with umtx structure. It
+ * can fault on any access.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ error = umtx_pi_claim(pi, td);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails,
+ * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = EFAULT;
+ break;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ if (old == owner)
+ error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
+ "umtxpi", timeout == NULL ? NULL : &timo);
+ else {
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtx_pi_unref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Unlock a PI mutex.
+ */
+static int
+do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq_first, *uq_first2, *uq_me;
+ struct umtx_pi *pi, *pi2;
+ uint32_t owner, old, id;
+ int error;
+ int count;
+ int pri;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count_pi(&key, &uq_first);
+ if (uq_first != NULL) {
+ mtx_lock_spin(&umtx_lock);
+ pi = uq_first->uq_pi_blocked;
+ KASSERT(pi != NULL, ("pi == NULL?"));
+ if (pi->pi_owner != curthread) {
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ /* userland messed up the mutex */
+ return (EPERM);
+ }
+ uq_me = curthread->td_umtxq;
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
+ /* get highest priority thread which is still sleeping. */
+ uq_first = TAILQ_FIRST(&pi->pi_blocked);
+ while (uq_first != NULL &&
+ (uq_first->uq_flags & UQF_UMTXQ) == 0) {
+ uq_first = TAILQ_NEXT(uq_first, uq_lockq);
+ }
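+ /*
+ * Recompute the priority lent to us from the PI mutexes
+ * we still own, now that this one is no longer ours.
+ */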
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
+ uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq_first2 != NULL) {
+ if (pri > UPRI(uq_first2->uq_thread))
+ pri = UPRI(uq_first2->uq_thread);
+ }
+ }
+ thread_lock(curthread);
+ sched_lend_user_prio(curthread, pri);
+ thread_unlock(curthread);
+ mtx_unlock_spin(&umtx_lock);
+ if (uq_first)
+ umtxq_signal_thread(uq_first);
+ }
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is at most one thread waiting for it; otherwise it
+ * must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Lock a PP mutex.
+ */
+static int
+do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int try)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t ceiling;
+ uint32_t owner, id;
+ int error, pri, old_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+ for (;;) {
+ old_inherited_pri = uq->uq_inherited_pri;
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
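+ /*
+ * Translate the userland ceiling into the kernel real-time
+ * priority range; out-of-range ceilings are rejected below.
+ */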
+ ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
+ if (ceiling > RTP_PRIO_MAX) {
+ error = EINVAL;
+ goto out;
+ }
+
+ mtx_lock_spin(&umtx_lock);
+ if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
+ mtx_unlock_spin(&umtx_lock);
+ error = EINVAL;
+ goto out;
+ }
+ if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
+ uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
+ thread_lock(td);
+ if (uq->uq_inherited_pri < UPRI(td))
+ sched_lend_user_prio(td, uq->uq_inherited_pri);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&umtx_lock);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+ if (error != 0) {
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+out:
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Unlock a PP mutex.
+ */
+static int
+do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t owner, id;
+ uint32_t rceiling;
+ int error, pri, new_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
+ if (error != 0)
+ return (error);
+
+ if (rceiling == -1)
+ new_inherited_pri = PRI_MAX;
+ else {
+ rceiling = RTP_PRIO_MAX - rceiling;
+ if (rceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ new_inherited_pri = PRI_MIN_REALTIME + rceiling;
+ }
+
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_unlock(&key);
+ /*
+ * For a priority protected mutex, always set the unlocked state
+ * to UMUTEX_CONTESTED so that userland always enters the kernel
+ * to lock the mutex; this is necessary because the thread priority
+ * has to be adjusted for such a mutex.
+ */
+ error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ if (error == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ if (error == -1)
+ error = EFAULT;
+ else {
+ mtx_lock_spin(&umtx_lock);
+ if (su != 0)
+ uq->uq_inherited_pri = new_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
+ uint32_t *old_ceiling)
+{
+ struct umtx_q *uq;
+ uint32_t save_ceiling;
+ uint32_t owner, id;
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if ((flags & UMUTEX_PRIO_PROTECT) == 0)
+ return (EINVAL);
+ if (ceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+ for (;;) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ save_ceiling = fuword32(&m->m_ceilings[0]);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ suword32(&m->m_ceilings[0], ceiling);
+ suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((owner & ~UMUTEX_CONTESTED) == id) {
+ suword32(&m->m_ceilings[0], ceiling);
+ error = 0;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", NULL);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ }
+ umtxq_lock(&uq->uq_key);
+ if (error == 0)
+ umtxq_signal(&uq->uq_key, INT_MAX);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ if (error == 0 && old_ceiling != NULL)
+ suword32(old_ceiling, save_ceiling);
+ return (error);
+}
+
+/*
+ * Lock a userland POSIX mutex.
+ */
+static int
+do_lock_umutex(struct thread *td, struct umutex *m,
+ struct _umtx_time *timeout, int mode)
+{
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ error = do_lock_normal(td, m, flags, timeout, mode);
+ break;
+ case UMUTEX_PRIO_INHERIT:
+ error = do_lock_pi(td, m, flags, timeout, mode);
+ break;
+ case UMUTEX_PRIO_PROTECT:
+ error = do_lock_pp(td, m, flags, timeout, mode);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (timeout == NULL) {
+ if (error == EINTR && mode != _UMUTEX_WAIT)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a userland POSIX mutex.
+ */
+static int
+do_unlock_umutex(struct thread *td, struct umutex *m)
+{
+ uint32_t flags;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ return (do_unlock_normal(td, m, flags));
+ case UMUTEX_PRIO_INHERIT:
+ return (do_unlock_pi(td, m, flags));
+ case UMUTEX_PRIO_PROTECT:
+ return (do_unlock_pp(td, m, flags));
+ }
+
+ return (EINVAL);
+}
+
+static int
+do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
+ struct timespec *timeout, u_long wflags)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags;
+ uint32_t clockid;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&cv->c_flags);
+ error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if ((wflags & CVWAIT_CLOCKID) != 0) {
+ clockid = fuword32(&cv->c_clockid);
+ if (clockid < CLOCK_REALTIME ||
+ clockid >= CLOCK_THREAD_CPUTIME_ID) {
+ /* hmm, only HW clock id will work. */
+ return (EINVAL);
+ }
+ } else {
+ clockid = CLOCK_REALTIME;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set c_has_waiters to 1 before releasing the user mutex, but
+ * avoid touching the cache line when it is not necessary.
+ */
+ if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
+ suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ error = do_unlock_umutex(td, m);
+
+ if (timeout != NULL)
+ abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
+ timeout);
+
+ umtxq_lock(&uq->uq_key);
+ if (error == 0) {
+ error = umtxq_sleep(uq, "ucond", timeout == NULL ?
+ NULL : &timo);
+ }
+
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else {
+ /*
+ * This must be a timeout, an interruption by a signal,
+ * or a spurious wakeup; clear the c_has_waiters flag
+ * when necessary.
+ */
+ umtxq_busy(&uq->uq_key);
+ if ((uq->uq_flags & UQF_UMTXQ) != 0) {
+ int oldlen = uq->uq_cur_queue->length;
+ umtxq_remove(uq);
+ if (oldlen == 1) {
+ umtxq_unlock(&uq->uq_key);
+ suword32(
+ __DEVOLATILE(uint32_t *,
+ &cv->c_has_waiters), 0);
+ umtxq_lock(&uq->uq_key);
+ }
+ }
+ umtxq_unbusy(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ }
+
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Signal a userland condition variable.
+ */
+static int
+do_cv_signal(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error, cnt, nwake;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ cnt = umtxq_count(&key);
+ nwake = umtxq_signal(&key, 1);
+ if (cnt <= nwake) {
+ umtxq_unlock(&key);
+ error = suword32(
+ __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+ umtxq_lock(&key);
+ }
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_cv_broadcast(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_signal(&key, INT_MAX);
+ umtxq_unlock(&key);
+
+ error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+
+ umtxq_lock(&key);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags, wrflags;
+ int32_t state, oldstate;
+ int32_t blocked_readers;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ wrflags = URWLOCK_WRITE_OWNER;
+ if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
+ wrflags |= URWLOCK_WRITE_WAITERS;
+
+ for (;;) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ /* try to lock it */
+ while (!(state & wrflags)) {
+ if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
+ umtx_key_release(&uq->uq_key);
+ return (EAGAIN);
+ }
+ oldstate = casuword32(&rwlock->rw_state, state, state + 1);
+ if (oldstate == -1) {
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+ if (oldstate == state) {
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ state = oldstate;
+ }
+
+ if (error)
+ break;
+
+ /* grab monitor lock */
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * re-read the state, in case it changed between the try-lock above
+ * and the check below
+ */
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+
+ /* set read contention bit */
+ while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ goto sleep;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ /* the state changed while setting the flags, restart */
+ if (!(state & wrflags)) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ continue;
+ }
+
+sleep:
+ /* contention bit is set, before sleeping, increase read waiter count */
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
+
+ while (state & wrflags) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
+ NULL : &timo);
+
+ umtxq_busy(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ if (error)
+ break;
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ }
+
+ /* decrease read waiter count, and may clear read contention bit */
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
+ if (blocked_readers == 1) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_READ_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ break;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ if (error != 0)
+ break;
+ }
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+static int
+do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags;
+ int32_t state, oldstate;
+ int32_t blocked_writers;
+ int32_t blocked_readers;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ blocked_readers = 0;
+ for (;;) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
+ if (oldstate == -1) {
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+ if (oldstate == state) {
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+
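+ /*
+ * If we are bailing out with an error and the lock has
+ * neither a write owner nor other write waiters, wake up
+ * any blocked readers.
+ */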
+ if (error) {
+ if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
+ blocked_readers != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ break;
+ }
+
+ /* grab monitor lock */
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * re-read the state, in case it changed between the try-lock above
+ * and the check below
+ */
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+
+ while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
+ (state & URWLOCK_WRITE_WAITERS) == 0) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ goto sleep;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ continue;
+ }
+sleep:
+ blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+ suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
+
+ while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
+ NULL : &timo);
+
+ umtxq_busy(&uq->uq_key);
+ umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
+ umtxq_unlock(&uq->uq_key);
+ if (error)
+ break;
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ }
+
+ blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+ suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
+ if (blocked_writers == 1) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_WRITE_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ break;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ /*
+ * We are leaving URWLOCK_WRITE_WAITERS set
+ * behind, but this should not harm
+ * correctness.
+ */
+ if (error != 0)
+ break;
+ }
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ } else
+ blocked_readers = 0;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+static int
+do_rw_unlock(struct thread *td, struct urwlock *rwlock)
+{
+ struct umtx_q *uq;
+ uint32_t flags;
+ int32_t state, oldstate;
+ int error, q, count;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ if (state & URWLOCK_WRITE_OWNER) {
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_WRITE_OWNER);
+ if (oldstate == -1) {
+ error = EFAULT;
+ goto out;
+ }
+ if (oldstate != state) {
+ state = oldstate;
+ if (!(oldstate & URWLOCK_WRITE_OWNER)) {
+ error = EPERM;
+ goto out;
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ goto out;
+ } else
+ break;
+ }
+ } else if (URWLOCK_READER_COUNT(state) != 0) {
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state - 1);
+ if (oldstate == -1) {
+ error = EFAULT;
+ goto out;
+ }
+ if (oldstate != state) {
+ state = oldstate;
+ if (URWLOCK_READER_COUNT(oldstate) == 0) {
+ error = EPERM;
+ goto out;
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ goto out;
+ } else
+ break;
+ }
+ } else {
+ error = EPERM;
+ goto out;
+ }
+
+ count = 0;
+
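+ /*
+ * Decide which waiters to wake: writers are preferred unless
+ * the lock was created with URWLOCK_PREFER_READER.
+ */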
+ if (!(flags & URWLOCK_PREFER_READER)) {
+ if (state & URWLOCK_WRITE_WAITERS) {
+ count = 1;
+ q = UMTX_EXCLUSIVE_QUEUE;
+ } else if (state & URWLOCK_READ_WAITERS) {
+ count = INT_MAX;
+ q = UMTX_SHARED_QUEUE;
+ }
+ } else {
+ if (state & URWLOCK_READ_WAITERS) {
+ count = INT_MAX;
+ q = UMTX_SHARED_QUEUE;
+ } else if (state & URWLOCK_WRITE_WAITERS) {
+ count = 1;
+ q = UMTX_EXCLUSIVE_QUEUE;
+ }
+ }
+
+ if (count) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_signal_queue(&uq->uq_key, count, q);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+out:
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+static int
+do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags, count;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&sem->_flags);
+ error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
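+ /*
+ * Advertise a waiter before re-reading the count, so that a
+ * concurrent do_sem_wake() cannot miss this thread.
+ */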
+ casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
+ count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
+ if (count != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
+
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else {
+ umtxq_remove(uq);
+ /* A relative timeout cannot be restarted. */
+ if (error == ERESTART && timeout != NULL &&
+ (timeout->_flags & UMTX_ABSTIME) == 0)
+ error = EINTR;
+ }
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Signal a userland semaphore.
+ */
+static int
+do_sem_wake(struct thread *td, struct _usem *sem)
+{
+ struct umtx_key key;
+ int error, cnt;
+ uint32_t flags;
+
+ flags = fuword32(&sem->_flags);
+ if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ cnt = umtxq_count(&key);
+ if (cnt > 0) {
+ umtxq_signal(&key, 1);
+ /*
+ * A count greater than 0 means the memory is still being
+ * referenced by user code, so we can safely update the
+ * _has_waiters flag.
+ */
+ if (cnt == 1) {
+ umtxq_unlock(&key);
+ error = suword32(
+ __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
+ umtxq_lock(&key);
+ }
+ }
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+int
+sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
+}
+
+int
+sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return do_unlock_umtx(td, uap->umtx, td->td_tid);
+}
+
+inline int
+umtx_copyin_timeout(const void *addr, struct timespec *tsp)
+{
+ int error;
+
+ error = copyin(addr, tsp, sizeof(struct timespec));
+ if (error == 0) {
+ if (tsp->tv_sec < 0 ||
+ tsp->tv_nsec >= 1000000000 ||
+ tsp->tv_nsec < 0)
+ error = EINVAL;
+ }
+ return (error);
+}
+
+static inline int
+umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
+{
+ int error;
+
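+ /*
+ * A bare timespec implies a relative CLOCK_REALTIME timeout;
+ * a larger buffer carries a full _umtx_time structure.
+ */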
+ if (size <= sizeof(struct timespec)) {
+ tp->_clockid = CLOCK_REALTIME;
+ tp->_flags = 0;
+ error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
+ } else
+ error = copyin(addr, tp, sizeof(struct _umtx_time));
+ if (error != 0)
+ return (error);
+ if (tp->_timeout.tv_sec < 0 ||
+ tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
+ return (EINVAL);
+ return (0);
+}
+
+static int
+__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_lock_umtx(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout, *tm_p;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
+}
+
+static int
+__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout, *tm_p;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+}
+
+static int
+__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+}
+
+static int
+__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (kern_umtx_wake(td, uap->obj, uap->val, 0));
+}
+
+#define BATCH_SIZE 128
+static int
+__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ int count = uap->val;
+ void *uaddrs[BATCH_SIZE];
+ char **upp = (char **)uap->obj;
+ int tocopy;
+ int error = 0;
+ int i, pos = 0;
+
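+ /*
+ * Copy the user-supplied array of addresses in batches of
+ * BATCH_SIZE and wake all private waiters on each address.
+ */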
+ while (count > 0) {
+ tocopy = count;
+ if (tocopy > BATCH_SIZE)
+ tocopy = BATCH_SIZE;
+ error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
+ if (error != 0)
+ break;
+ for (i = 0; i < tocopy; ++i)
+ kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
+ count -= tocopy;
+ pos += tocopy;
+ }
+ return (error);
+}
+
+static int
+__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (kern_umtx_wake(td, uap->obj, uap->val, 1));
+}
+
+static int
+__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, 0);
+}
+
+static int
+__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
+}
+
+static int
+__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+}
+
+static int
+__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_wake_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_unlock_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
+}
+
+static int
+__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_signal(td, uap->obj);
+}
+
+static int
+__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_broadcast(td, uap->obj);
+}
+
+static int
+__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_rdlock(td, uap->obj, uap->val, 0);
+ } else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_wrlock(td, uap->obj, 0);
+ } else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+
+ error = do_rw_wrlock(td, uap->obj, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_rw_unlock(td, uap->obj);
+}
+
+static int
+__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return (do_sem_wait(td, uap->obj, tm_p));
+}
+
+static int
+__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_sem_wake(td, uap->obj);
+}
+
+static int
+__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_wake2_umutex(td, uap->obj, uap->val);
+}
+
+typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
+
+static _umtx_op_func op_table[] = {
+ __umtx_op_lock_umtx, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait, /* UMTX_OP_CV_WAIT*/
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
+ __umtx_op_wait_uint, /* UMTX_OP_WAIT_UINT */
+ __umtx_op_rw_rdlock, /* UMTX_OP_RW_RDLOCK */
+ __umtx_op_rw_wrlock, /* UMTX_OP_RW_WRLOCK */
+ __umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
+ __umtx_op_wait_uint_private, /* UMTX_OP_WAIT_UINT_PRIVATE */
+ __umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
+ __umtx_op_wait_umutex, /* UMTX_OP_UMUTEX_WAIT */
+ __umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
+ __umtx_op_sem_wait, /* UMTX_OP_SEM_WAIT */
+ __umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
+ __umtx_op_nwake_private, /* UMTX_OP_NWAKE_PRIVATE */
+ __umtx_op_wake2_umutex /* UMTX_OP_UMUTEX_WAKE2 */
+};
+
+int
+sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table[uap->op])(td, uap);
+ return (EINVAL);
+}
+
+#ifdef COMPAT_FREEBSD32
+int
+freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
+}
+
+int
+freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
+}
+
+struct timespec32 {
+ int32_t tv_sec;
+ int32_t tv_nsec;
+};
+
+struct umtx_time32 {
+ struct timespec32 timeout;
+ uint32_t flags;
+ uint32_t clockid;
+};
+
+static inline int
+umtx_copyin_timeout32(void *addr, struct timespec *tsp)
+{
+ struct timespec32 ts32;
+ int error;
+
+ error = copyin(addr, &ts32, sizeof(struct timespec32));
+ if (error == 0) {
+ if (ts32.tv_sec < 0 ||
+ ts32.tv_nsec >= 1000000000 ||
+ ts32.tv_nsec < 0)
+ error = EINVAL;
+ else {
+ tsp->tv_sec = ts32.tv_sec;
+ tsp->tv_nsec = ts32.tv_nsec;
+ }
+ }
+ return (error);
+}
+
+static inline int
+umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
+{
+ struct umtx_time32 t32;
+ int error;
+
+ t32.clockid = CLOCK_REALTIME;
+ t32.flags = 0;
+ if (size <= sizeof(struct timespec32))
+ error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
+ else
+ error = copyin(addr, &t32, sizeof(struct umtx_time32));
+ if (error != 0)
+ return (error);
+ if (t32.timeout.tv_sec < 0 ||
+ t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
+ return (EINVAL);
+ tp->_timeout.tv_sec = t32.timeout.tv_sec;
+ tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
+ tp->_flags = t32.flags;
+ tp->_clockid = t32.clockid;
+ return (0);
+}
+
+static int
+__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_lock_umtx32(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
+}
+
+static int
+__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+}
+
+static int
+__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, 0);
+}
+
+static int
+__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+}
+
+static int
+__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_rdlock(td, uap->obj, uap->val, 0);
+ } else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_wrlock(td, uap->obj, 0);
+ } else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_wrlock(td, uap->obj, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+}
+
+static int
+__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return (do_sem_wait(td, uap->obj, tm_p));
+}
+
+static int
+__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
+{
+ int count = uap->val;
+ uint32_t uaddrs[BATCH_SIZE];
+ uint32_t **upp = (uint32_t **)uap->obj;
+ int tocopy;
+ int error = 0;
+ int i, pos = 0;
+
+ while (count > 0) {
+ tocopy = count;
+ if (tocopy > BATCH_SIZE)
+ tocopy = BATCH_SIZE;
+ error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
+ if (error != 0)
+ break;
+ for (i = 0; i < tocopy; ++i)
+ kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
+ INT_MAX, 1);
+ count -= tocopy;
+ pos += tocopy;
+ }
+ return (error);
+}
+
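+/*
+ * Dispatch table for the 32-bit compat ABI, indexed by UMTX_OP_*.
+ * Operations that copy in a timespec or an address array get the compat32
+ * wrappers defined above; the remainder reuse the native handlers.
+ */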
+static _umtx_op_func op_table_compat32[] = {
+ __umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait_compat32, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT*/
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
+ __umtx_op_wait_compat32, /* UMTX_OP_WAIT_UINT */
+ __umtx_op_rw_rdlock_compat32, /* UMTX_OP_RW_RDLOCK */
+ __umtx_op_rw_wrlock_compat32, /* UMTX_OP_RW_WRLOCK */
+ __umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
+ __umtx_op_wait_uint_private_compat32, /* UMTX_OP_WAIT_UINT_PRIVATE */
+ __umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
+ __umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
+ __umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
+ __umtx_op_sem_wait_compat32, /* UMTX_OP_SEM_WAIT */
+ __umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
+ __umtx_op_nwake_private32, /* UMTX_OP_NWAKE_PRIVATE */
+ __umtx_op_wake2_umutex /* UMTX_OP_UMUTEX_WAKE2 */
+};
+
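+/*
+ * Entry point for _umtx_op(2) from 32-bit processes: bounds-check the
+ * opcode and dispatch through the compat32 table.  For the timed
+ * operations, uap->uaddr1 carries the size of the user time structure.
+ */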
+int
+freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table_compat32[uap->op])(td,
+ (struct _umtx_op_args *)uap);
+ return (EINVAL);
+}
+#endif
+
+void
+umtx_thread_init(struct thread *td)
+{
+ td->td_umtxq = umtxq_alloc();
+ td->td_umtxq->uq_thread = td;
+}
+
+void
+umtx_thread_fini(struct thread *td)
+{
+ umtxq_free(td->td_umtxq);
+}
+
+/*
+ * Called when a new thread is created, e.g. via fork().
+ */
+void
+umtx_thread_alloc(struct thread *td)
+{
+ struct umtx_q *uq;
+
+ uq = td->td_umtxq;
+ uq->uq_inherited_pri = PRI_MAX;
+
+ KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
+ KASSERT(uq->uq_thread == td, ("uq_thread != td"));
+ KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
+ KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
+}
+
+/*
+ * exec() hook.
+ */
+static void
+umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused)
+{
+ umtx_thread_cleanup(curthread);
+}
+
+/*
+ * thread_exit() hook.
+ */
+void
+umtx_thread_exit(struct thread *td)
+{
+ umtx_thread_cleanup(td);
+}
+
+/*
+ * clean up umtx data.
+ */
+static void
+umtx_thread_cleanup(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ if ((uq = td->td_umtxq) == NULL)
+ return;
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = PRI_MAX;
+ while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+ }
+ mtx_unlock_spin(&umtx_lock);
+ thread_lock(td);
+ sched_lend_user_prio(td, PRI_MAX);
+ thread_unlock(td);
+}
diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c
new file mode 100644
index 0000000..fd4027b
--- /dev/null
+++ b/sys/kern/kern_uuid.c
@@ -0,0 +1,426 @@
+/*-
+ * Copyright (c) 2002 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/socket.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/jail.h>
+#include <sys/uuid.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/vnet.h>
+
+/*
+ * See also:
+ * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
+ * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
+ *
+ * Note that the generator state is itself a UUID, but the time and clock
+ * sequence fields are written in the native byte order.
+ */
+
+CTASSERT(sizeof(struct uuid) == 16);
+
+/* We use an alternative, more convenient representation in the generator. */
+struct uuid_private {
+ union {
+ uint64_t ll; /* internal. */
+ struct {
+ uint32_t low;
+ uint16_t mid;
+ uint16_t hi;
+ } x;
+ } time;
+ uint16_t seq; /* Big-endian. */
+ uint16_t node[UUID_NODE_LEN>>1];
+};
+
+CTASSERT(sizeof(struct uuid_private) == 16);
+
+struct uuid_macaddr {
+ uint16_t state;
+#define UUID_ETHER_EMPTY 0
+#define UUID_ETHER_RANDOM 1
+#define UUID_ETHER_UNIQUE 2
+ uint16_t node[UUID_NODE_LEN>>1];
+};
+
+static struct uuid_private uuid_last;
+
+#define UUID_NETHER 4
+static struct uuid_macaddr uuid_ether[UUID_NETHER];
+
+static struct mtx uuid_mutex;
+MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
+
+/*
+ * Return the first MAC address added in the array. If it's empty, then
+ * construct a sufficiently random multicast MAC address first. Any
+ * addresses added later will bump the random MAC address up to the next
+ * index.
+ */
+static void
+uuid_node(uint16_t *node)
+{
+ int i;
+
+ if (uuid_ether[0].state == UUID_ETHER_EMPTY) {
+ for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+ uuid_ether[0].node[i] = (uint16_t)arc4random();
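+ /* Mark the randomly generated node id by setting the multicast bit. */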
+ *((uint8_t*)uuid_ether[0].node) |= 0x01;
+ uuid_ether[0].state = UUID_ETHER_RANDOM;
+ }
+ for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+ node[i] = uuid_ether[0].node[i];
+}
+
+/*
+ * Get the current time as a 60 bit count of 100-nanosecond intervals
+ * since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
+ * the Unix time since 00:00:00.00, January 1, 1970 to the date of the
+ * Gregorian reform to the Christian calendar.
+ */
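+/*
+ * Illustrative arithmetic (not part of the original code): the offset used
+ * below, 0x01B21DD213814000, is the number of 100-nanosecond intervals in
+ * the 141427 days between 1582-10-15 and 1970-01-01, i.e.
+ * 141427 * 86400 * 10^7 = 122192928000000000.
+ */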
+static uint64_t
+uuid_time(void)
+{
+ struct bintime bt;
+ uint64_t time = 0x01B21DD213814000LL;
+
+ bintime(&bt);
+ time += (uint64_t)bt.sec * 10000000LL;
+ time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (time & ((1LL << 60) - 1LL));
+}
+
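+/*
+ * Generate `count' consecutive version 1 (time-based) UUIDs.  Under the
+ * generator lock the clock sequence is re-randomized when the state is
+ * uninitialized or the node id has changed, and incremented when the clock
+ * has not advanced; the timestamp range [time, time + count) is reserved
+ * so the loop below can fill in per-UUID time fields without the lock.
+ */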
+struct uuid *
+kern_uuidgen(struct uuid *store, size_t count)
+{
+ struct uuid_private uuid;
+ uint64_t time;
+ size_t n;
+
+ mtx_lock(&uuid_mutex);
+
+ uuid_node(uuid.node);
+ time = uuid_time();
+
+ if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
+ uuid_last.node[1] != uuid.node[1] ||
+ uuid_last.node[2] != uuid.node[2])
+ uuid.seq = (uint16_t)arc4random() & 0x3fff;
+ else if (uuid_last.time.ll >= time)
+ uuid.seq = (uuid_last.seq + 1) & 0x3fff;
+ else
+ uuid.seq = uuid_last.seq;
+
+ uuid_last = uuid;
+ uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
+
+ mtx_unlock(&uuid_mutex);
+
+ /* Set sequence and variant and deal with byte order. */
+ uuid.seq = htobe16(uuid.seq | 0x8000);
+
+ for (n = 0; n < count; n++) {
+ /* Set time and version (=1). */
+ uuid.time.x.low = (uint32_t)time;
+ uuid.time.x.mid = (uint16_t)(time >> 32);
+ uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
+ store[n] = *(struct uuid *)&uuid;
+ time++;
+ }
+
+ return (store);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct uuidgen_args {
+ struct uuid *store;
+ int count;
+};
+#endif
+int
+sys_uuidgen(struct thread *td, struct uuidgen_args *uap)
+{
+ struct uuid *store;
+ size_t count;
+ int error;
+
+ /*
+ * Limit the number of UUIDs that can be created at the same time
+ * to some arbitrary number. This isn't really necessary, but I
+ * like to have some sort of upper-bound that's less than 2G :-)
+ * XXX probably needs to be tunable.
+ */
+ if (uap->count < 1 || uap->count > 2048)
+ return (EINVAL);
+
+ count = uap->count;
+ store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
+ kern_uuidgen(store, count);
+ error = copyout(store, uap->store, count * sizeof(struct uuid));
+ free(store, M_TEMP);
+ return (error);
+}
+
+int
+uuid_ether_add(const uint8_t *addr)
+{
+ int i, sum;
+
+ /*
+ * Validate input. No multicast (flag 0x1), no locally administered
+ * (flag 0x2) and no 'all-zeroes' addresses.
+ */
+ if (addr[0] & 0x03)
+ return (EINVAL);
+ sum = 0;
+ for (i = 0; i < UUID_NODE_LEN; i++)
+ sum += addr[i];
+ if (sum == 0)
+ return (EINVAL);
+
+ mtx_lock(&uuid_mutex);
+
+ /* Make sure the MAC isn't known already and that there's space. */
+ i = 0;
+ while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE) {
+ if (!bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) {
+ mtx_unlock(&uuid_mutex);
+ return (EEXIST);
+ }
+ i++;
+ }
+ if (i == UUID_NETHER) {
+ mtx_unlock(&uuid_mutex);
+ return (ENOSPC);
+ }
+
+ /* Insert the MAC at index i, pushing the random entry (if any) up a slot. */
+ if (uuid_ether[i].state == UUID_ETHER_RANDOM && i < UUID_NETHER - 1)
+ uuid_ether[i + 1] = uuid_ether[i];
+ uuid_ether[i].state = UUID_ETHER_UNIQUE;
+ bcopy(addr, uuid_ether[i].node, UUID_NODE_LEN);
+ mtx_unlock(&uuid_mutex);
+ return (0);
+}
+
+int
+uuid_ether_del(const uint8_t *addr)
+{
+ int i;
+
+ mtx_lock(&uuid_mutex);
+ i = 0;
+ while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE &&
+ bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN))
+ i++;
+ if (i == UUID_NETHER || uuid_ether[i].state != UUID_ETHER_UNIQUE) {
+ mtx_unlock(&uuid_mutex);
+ return (ENOENT);
+ }
+
+ /* Remove it by shifting higher index entries down. */
+ while (i < UUID_NETHER - 1 && uuid_ether[i].state != UUID_ETHER_EMPTY) {
+ uuid_ether[i] = uuid_ether[i + 1];
+ i++;
+ }
+ if (uuid_ether[i].state != UUID_ETHER_EMPTY) {
+ uuid_ether[i].state = UUID_ETHER_EMPTY;
+ bzero(uuid_ether[i].node, UUID_NODE_LEN);
+ }
+ mtx_unlock(&uuid_mutex);
+ return (0);
+}
+
+int
+snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
+{
+ struct uuid_private *id;
+ int cnt;
+
+ id = (struct uuid_private *)uuid;
+ cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
+ id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
+ be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
+ return (cnt);
+}
+
+int
+printf_uuid(struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (printf("%s", buf));
+}
+
+int
+sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (sbuf_printf(sb, "%s", buf));
+}
+
+/*
+ * Encode/Decode UUID into byte-stream.
+ * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | time_low |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | time_mid | time_hi_and_version |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |clk_seq_hi_res | clk_seq_low | node (0-1) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | node (2-5) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
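+/*
+ * Worked example (derived from the functions below): for the UUID printed
+ * as "00112233-4455-6677-8899-aabbccddeeff", be_uuid_enc() emits the bytes
+ * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff, while le_uuid_enc()
+ * emits 33 22 11 00 55 44 77 66 88 99 aa bb cc dd ee ff; only the first
+ * three fields are byte-swapped.
+ */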
+void
+le_uuid_enc(void *buf, struct uuid const *uuid)
+{
+ u_char *p;
+ int i;
+
+ p = buf;
+ le32enc(p, uuid->time_low);
+ le16enc(p + 4, uuid->time_mid);
+ le16enc(p + 6, uuid->time_hi_and_version);
+ p[8] = uuid->clock_seq_hi_and_reserved;
+ p[9] = uuid->clock_seq_low;
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ p[10 + i] = uuid->node[i];
+}
+
+void
+le_uuid_dec(void const *buf, struct uuid *uuid)
+{
+ u_char const *p;
+ int i;
+
+ p = buf;
+ uuid->time_low = le32dec(p);
+ uuid->time_mid = le16dec(p + 4);
+ uuid->time_hi_and_version = le16dec(p + 6);
+ uuid->clock_seq_hi_and_reserved = p[8];
+ uuid->clock_seq_low = p[9];
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ uuid->node[i] = p[10 + i];
+}
+
+void
+be_uuid_enc(void *buf, struct uuid const *uuid)
+{
+ u_char *p;
+ int i;
+
+ p = buf;
+ be32enc(p, uuid->time_low);
+ be16enc(p + 4, uuid->time_mid);
+ be16enc(p + 6, uuid->time_hi_and_version);
+ p[8] = uuid->clock_seq_hi_and_reserved;
+ p[9] = uuid->clock_seq_low;
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ p[10 + i] = uuid->node[i];
+}
+
+void
+be_uuid_dec(void const *buf, struct uuid *uuid)
+{
+ u_char const *p;
+ int i;
+
+ p = buf;
+ uuid->time_low = be32dec(p);
+ uuid->time_mid = be16dec(p + 4);
+ uuid->time_hi_and_version = be16dec(p + 6);
+ uuid->clock_seq_hi_and_reserved = p[8];
+ uuid->clock_seq_low = p[9];
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ uuid->node[i] = p[10 + i];
+}
+
+int
+parse_uuid(const char *str, struct uuid *uuid)
+{
+ u_int c[11];
+ int n;
+
+ /* An empty string represents a nil UUID. */
+ if (*str == '\0') {
+ bzero(uuid, sizeof(*uuid));
+ return (0);
+ }
+
+ /* The UUID string representation has a fixed length. */
+ if (strlen(str) != 36)
+ return (EINVAL);
+
+ /*
+ * We only work with "new" UUIDs. New UUIDs have the form:
+ * 01234567-89ab-cdef-0123-456789abcdef
+ * The so called "old" UUIDs, which we don't support, have the form:
+ * 0123456789ab.cd.ef.01.23.45.67.89.ab
+ */
+ if (str[8] != '-')
+ return (EINVAL);
+
+ n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
+ c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
+ /* Make sure we have all conversions. */
+ if (n != 11)
+ return (EINVAL);
+
+ /* Successful scan. Build the UUID. */
+ uuid->time_low = c[0];
+ uuid->time_mid = c[1];
+ uuid->time_hi_and_version = c[2];
+ uuid->clock_seq_hi_and_reserved = c[3];
+ uuid->clock_seq_low = c[4];
+ for (n = 0; n < 6; n++)
+ uuid->node[n] = c[n + 5];
+
+ /* Check semantics... */
+ return (((c[3] & 0x80) != 0x00 && /* variant 0? */
+ (c[3] & 0xc0) != 0x80 && /* variant 1? */
+ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
+}
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
new file mode 100644
index 0000000..095e3ff
--- /dev/null
+++ b/sys/kern/kern_xxx.c
@@ -0,0 +1,471 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/utsname.h>
+
+#include <vm/vm_param.h>
+
+#if defined(COMPAT_43)
+
+#ifndef _SYS_SYSPROTO_H_
+struct gethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostname(td, uap)
+ struct thread *td;
+ struct gethostname_args *uap;
+{
+ int name[2];
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(td, name, 2, uap->hostname, &len,
+ 1, 0, 0, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+osethostname(td, uap)
+ struct thread *td;
+ register struct sethostname_args *uap;
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname,
+ uap->len, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogethostid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostid(td, uap)
+ struct thread *td;
+ struct ogethostid_args *uap;
+{
+ size_t len = sizeof(long);
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTID;
+ return (kernel_sysctl(td, name, 2, (long *)td->td_retval, &len,
+ NULL, 0, NULL, 0));
+}
+#endif /* COMPAT_43 */
+
+#ifdef COMPAT_43
+#ifndef _SYS_SYSPROTO_H_
+struct osethostid_args {
+ long hostid;
+};
+#endif
+/* ARGSUSED */
+int
+osethostid(td, uap)
+ struct thread *td;
+ struct osethostid_args *uap;
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTID;
+ return (kernel_sysctl(td, name, 2, NULL, NULL, &uap->hostid,
+ sizeof(uap->hostid), NULL, 0));
+}
+
+int
+oquota(td, uap)
+ struct thread *td;
+ struct oquota_args *uap;
+{
+
+ return (ENOSYS);
+}
+
+#define KINFO_PROC (0<<8)
+#define KINFO_RT (1<<8)
+#define KINFO_VNODE (2<<8)
+#define KINFO_FILE (3<<8)
+#define KINFO_METER (4<<8)
+#define KINFO_LOADAVG (5<<8)
+#define KINFO_CLOCKRATE (6<<8)
+
+/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
+#define KINFO_BSDI_SYSINFO (101<<8)
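+/*
+ * ogetkerninfo() dispatches on the class encoded in bits 8-15 of `op'
+ * (masked with 0xff00 below); the low byte and, for KINFO_RT, bits 16-23
+ * carry class-specific arguments.
+ */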
+
+/*
+ * XXX this is bloat, but I hope it's better here than on the potentially
+ * limited kernel stack... -Peter
+ */
+
+static struct {
+ int bsdi_machine; /* "i386" on BSD/386 */
+/* ^^^ this is an offset to the string, relative to the struct start */
+ char *pad0;
+ long pad1;
+ long pad2;
+ long pad3;
+ u_long pad4;
+ u_long pad5;
+ u_long pad6;
+
+ int bsdi_ostype; /* "BSD/386" on BSD/386 */
+ int bsdi_osrelease; /* "1.1" on BSD/386 */
+ long pad7;
+ long pad8;
+ char *pad9;
+
+ long pad10;
+ long pad11;
+ int pad12;
+ long pad13;
+ quad_t pad14;
+ long pad15;
+
+ struct timeval pad16;
+ /* we don't set this, because BSDI's uname used gethostname() instead */
+ int bsdi_hostname; /* hostname on BSD/386 */
+
+ /* the actual string data is appended here */
+
+} bsdi_si;
+
+/*
+ * this data is appended to the end of the bsdi_si structure during copyout.
+ * The "char *" offsets are relative to the base of the bsdi_si struct.
+ * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
+ * should not exceed the length of the buffer here... (or else!! :-)
+ */
+static char bsdi_strings[80]; /* It had better be less than this! */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getkerninfo_args {
+ int op;
+ char *where;
+ size_t *size;
+ int arg;
+};
+#endif
+int
+ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
+{
+ int error, name[6];
+ size_t size;
+ u_int needed = 0;
+
+ switch (uap->op & 0xff00) {
+
+ case KINFO_RT:
+ name[0] = CTL_NET;
+ name[1] = PF_ROUTE;
+ name[2] = 0;
+ name[3] = (uap->op & 0xff0000) >> 16;
+ name[4] = uap->op & 0xff;
+ name[5] = uap->arg;
+ error = userland_sysctl(td, name, 6, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_VNODE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_VNODE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_PROC:
+ name[0] = CTL_KERN;
+ name[1] = KERN_PROC;
+ name[2] = uap->op & 0xff;
+ name[3] = uap->arg;
+ error = userland_sysctl(td, name, 4, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_FILE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_FILE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_METER:
+ name[0] = CTL_VM;
+ name[1] = VM_TOTAL;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_LOADAVG:
+ name[0] = CTL_VM;
+ name[1] = VM_LOADAVG;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_CLOCKRATE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_CLOCKRATE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_BSDI_SYSINFO: {
+ /*
+ * this is pretty crude, but it's just enough for uname()
+ * from BSDI's 1.x libc to work.
+ *
+ * *size gives the size of the buffer before the call, and
+ * the amount of data copied after a successful call.
+ * If successful, the return value is the amount of data
+ * available, which can be larger than *size.
+ *
+ * BSDI's 2.x product apparently fails with ENOMEM if *size
+ * is too small.
+ */
+
+ u_int left;
+ char *s;
+
+ bzero((char *)&bsdi_si, sizeof(bsdi_si));
+ bzero(bsdi_strings, sizeof(bsdi_strings));
+
+ s = bsdi_strings;
+
+ bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, ostype);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, osrelease);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, machine);
+ s += strlen(s) + 1;
+
+ needed = sizeof(bsdi_si) + (s - bsdi_strings);
+
+ if ((uap->where == NULL) || (uap->size == NULL)) {
+ /* process is asking how much buffer to supply.. */
+ size = needed;
+ error = 0;
+ break;
+ }
+
+ if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
+ break;
+
+ /* if too much buffer supplied, trim it down */
+ if (size > needed)
+ size = needed;
+
+ /* how much of the buffer is remaining */
+ left = size;
+
+ if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
+ break;
+
+ /* is there any point in continuing? */
+ if (left > sizeof(bsdi_si)) {
+ left -= sizeof(bsdi_si);
+ error = copyout(&bsdi_strings,
+ uap->where + sizeof(bsdi_si), left);
+ }
+ break;
+ }
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ if (error == 0) {
+ td->td_retval[0] = needed ? needed : size;
+ if (uap->size) {
+ error = copyout(&size, uap->size, sizeof(size));
+ }
+ }
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * This is the FreeBSD-1.1 compatible uname(2) interface. These days it is
+ * done in libc as a wrapper around a bunch of sysctl's. This must maintain
+ * the old 1.1 binary ABI.
+ */
+#if SYS_NMLN != 32
+#error "FreeBSD-1.1 uname syscall has been broken"
+#endif
+#ifndef _SYS_SYSPROTO_H_
+struct uname_args {
+ struct utsname *name;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_uname(struct thread *td, struct freebsd4_uname_args *uap)
+{
+ int name[2], error;
+ size_t len;
+ char *s, *us;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_OSTYPE;
+ len = sizeof (uap->name->sysname);
+ error = userland_sysctl(td, name, 2, uap->name->sysname, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
+
+ name[1] = KERN_HOSTNAME;
+ len = sizeof uap->name->nodename;
+ error = userland_sysctl(td, name, 2, uap->name->nodename, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
+
+ name[1] = KERN_OSRELEASE;
+ len = sizeof uap->name->release;
+ error = userland_sysctl(td, name, 2, uap->name->release, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
+
+/*
+ name = KERN_VERSION;
+ len = sizeof uap->name->version;
+ error = userland_sysctl(td, name, 2, uap->name->version, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
+*/
+
+/*
+ * this stupid hackery to make the version field look like FreeBSD 1.1
+ */
+ for(s = version; *s && *s != '#'; s++);
+
+ for(us = uap->name->version; *s && *s != ':'; s++) {
+ error = subyte( us++, *s);
+ if (error)
+ return (error);
+ }
+ error = subyte( us++, 0);
+ if (error)
+ return (error);
+
+ name[0] = CTL_HW;
+ name[1] = HW_MACHINE;
+ len = sizeof uap->name->machine;
+ error = userland_sysctl(td, name, 2, uap->name->machine, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_getdomainname(struct thread *td,
+ struct freebsd4_getdomainname_args *uap)
+{
+ int name[2];
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_NISDOMAINNAME;
+ return (userland_sysctl(td, name, 2, uap->domainname, &len,
+ 1, 0, 0, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_setdomainname(struct thread *td,
+ struct freebsd4_setdomainname_args *uap)
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_NISDOMAINNAME;
+ return (userland_sysctl(td, name, 2, 0, 0, 0, uap->domainname,
+ uap->len, 0, 0));
+}
+#endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c
new file mode 100644
index 0000000..efb673e
--- /dev/null
+++ b/sys/kern/ksched.c
@@ -0,0 +1,292 @@
+/*-
+ * Copyright (c) 1996, 1997
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/resource.h>
+#include <sys/sched.h>
+
+FEATURE(kposix_priority_scheduling, "POSIX P1003.1B realtime extensions");
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+ struct timespec rr_interval;
+};
+
+int
+ksched_attach(struct ksched **p)
+{
+ struct ksched *ksched = p31b_malloc(sizeof(*ksched));
+
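+ /*
+ * sched_rr_interval() gives the round-robin quantum in scheduler
+ * ticks; 1000000000 / hz is the number of nanoseconds per tick.
+ */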
+ ksched->rr_interval.tv_sec = 0;
+ ksched->rr_interval.tv_nsec = 1000000000L / hz * sched_rr_interval();
+
+ *p = ksched;
+ return 0;
+}
+
+int
+ksched_detach(struct ksched *ks)
+{
+ p31b_free(ks);
+
+ return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ * POSIX 1003.1b requires that numerically higher priorities be of
+ * higher priority. It also permits sched_setparam to be
+ * implementation defined for SCHED_OTHER. I don't like
+ * the notion of inverted priorities for normal processes when
+ * you can use "setpriority" for that.
+ *
+ */
+
+/* Macros to convert between the Unix convention (numerically lower is
+ * higher priority) and POSIX 1003.1b (numerically higher is higher
+ * priority).
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+#define p4prio_to_tsprio(P) ((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) - (P))
+#define tsprio_to_p4prio(P) ((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
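+
+/*
+ * Worked example (illustrative, assuming RTP_PRIO_MIN == 0 and
+ * RTP_PRIO_MAX == 31 as in <sys/rtprio.h>): P1B_PRIO_MIN is 0 and
+ * P1B_PRIO_MAX is 31, and a POSIX priority P maps to rtprio 31 - P, so the
+ * strongest POSIX priority (31) becomes the numerically lowest, i.e.
+ * strongest, rtprio (0).
+ */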
+
+static __inline int
+getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ struct rtprio rtp;
+ int e = 0;
+
+ pri_to_rtp(td, &rtp);
+ switch (rtp.type)
+ {
+ case RTP_PRIO_FIFO:
+ *policy = SCHED_FIFO;
+ break;
+
+ case RTP_PRIO_REALTIME:
+ *policy = SCHED_RR;
+ break;
+
+ default:
+ *policy = SCHED_OTHER;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_setparam(struct ksched *ksched,
+ struct thread *td, const struct sched_param *param)
+{
+ int policy;
+ int e;
+
+ e = getscheduler(ksched, td, &policy);
+
+ if (e == 0)
+ {
+ e = ksched_setscheduler(ksched, td, policy, param);
+ }
+
+ return e;
+}
+
+int
+ksched_getparam(struct ksched *ksched,
+ struct thread *td, struct sched_param *param)
+{
+ struct rtprio rtp;
+
+ pri_to_rtp(td, &rtp);
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+ param->sched_priority = rtpprio_to_p4prio(rtp.prio);
+ else {
+ if (PRI_MIN_TIMESHARE < rtp.prio)
+ /*
+ * The interactive score has pushed it to the realtime
+ * minimum, so we must show the maximum (64, most likely).
+ */
+ param->sched_priority = (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE);
+ else
+ param->sched_priority = tsprio_to_p4prio(rtp.prio);
+ }
+ return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ * be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int
+ksched_setscheduler(struct ksched *ksched,
+ struct thread *td, int policy, const struct sched_param *param)
+{
+ int e = 0;
+ struct rtprio rtp;
+
+ switch(policy)
+ {
+ case SCHED_RR:
+ case SCHED_FIFO:
+
+ if (param->sched_priority >= P1B_PRIO_MIN &&
+ param->sched_priority <= P1B_PRIO_MAX)
+ {
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp.type = (policy == SCHED_FIFO)
+ ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+ rtp_to_pri(&rtp, td);
+ }
+ else
+ e = EPERM;
+
+
+ break;
+
+ case SCHED_OTHER:
+ if (param->sched_priority >= 0 &&
+ param->sched_priority <= (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) {
+ rtp.type = RTP_PRIO_NORMAL;
+ rtp.prio = p4prio_to_tsprio(param->sched_priority);
+ rtp_to_pri(&rtp, td);
+ } else
+ e = EINVAL;
+
+ break;
+
+ default:
+ e = EINVAL;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ return getscheduler(ksched, td, policy);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int
+ksched_yield(struct ksched *ksched)
+{
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+ksched_get_priority_max(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = RTP_PRIO_MAX;
+ break;
+
+ case SCHED_OTHER:
+ *prio = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_get_priority_min(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = P1B_PRIO_MIN;
+ break;
+
+ case SCHED_OTHER:
+ *prio = 0;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_rr_get_interval(struct ksched *ksched,
+ struct thread *td, struct timespec *timespec)
+{
+ *timespec = ksched->rr_interval;
+
+ return 0;
+}
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
new file mode 100644
index 0000000..6252a8d
--- /dev/null
+++ b/sys/kern/link_elf.c
@@ -0,0 +1,1605 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_gdb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mount.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#ifdef SPARSE_MAPPING
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#endif
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/link_elf.h>
+
+#ifdef DDB_CTF
+#include <net/zlib.h>
+#endif
+
+#include "linker_if.h"
+
+#define MAXSEGS 4
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+ int preloaded; /* Was file pre-loaded */
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ Elf_Dyn *dynamic; /* Symbol table etc. */
+ Elf_Hashelt nbuckets; /* DT_HASH info */
+ Elf_Hashelt nchains;
+ const Elf_Hashelt *buckets;
+ const Elf_Hashelt *chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym *symtab; /* DT_SYMTAB */
+ Elf_Addr *got; /* DT_PLTGOT */
+ const Elf_Rel *pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela *pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel *rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela *rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym *ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+ caddr_t symbase; /* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+ caddr_t ctftab; /* CTF table */
+ long ctfcnt; /* number of bytes in CTF table */
+ caddr_t ctfoff; /* CTF offset table */
+ caddr_t typoff; /* Type offset table */
+ long typlen; /* Number of type entries. */
+ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */
+ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */
+ Elf_Addr pcpu_base; /* Relocated pcpu set address. */
+#ifdef VIMAGE
+ Elf_Addr vnet_start; /* Pre-relocation vnet set start. */
+ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */
+ Elf_Addr vnet_base; /* Relocated vnet set address. */
+#endif
+#ifdef GDB
+ struct link_map gdb; /* hooks for gdb */
+#endif
+} *elf_file_t;
+
+struct elf_set {
+ Elf_Addr es_start;
+ Elf_Addr es_stop;
+ Elf_Addr es_base;
+ TAILQ_ENTRY(elf_set) es_link;
+};
+
+TAILQ_HEAD(elf_set_head, elf_set);
+
+#include <kern/kern_ctf.c>
+
+static int link_elf_link_common_finish(linker_file_t);
+static int link_elf_link_preload(linker_class_t cls,
+ const char *, linker_file_t *);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char *,
+ linker_file_t *);
+static int link_elf_lookup_symbol(linker_file_t, const char *,
+ c_linker_sym_t *);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
+ linker_symval_t *);
+static int link_elf_search_symbol(linker_file_t, caddr_t,
+ c_linker_sym_t *, long *);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_preload(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *), void *);
+static int link_elf_each_function_nameval(linker_file_t,
+ linker_function_nameval_callback_t, void *);
+static void link_elf_reloc_local(linker_file_t);
+static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
+static long link_elf_strtab_get(linker_file_t, caddr_t *);
+static Elf_Addr elf_lookup(linker_file_t, Elf_Size, int);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
+ KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
+ KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
+ KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32",
+#else
+ "elf64",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int parse_dynamic(elf_file_t);
+static int relocate_file(elf_file_t);
+static int link_elf_preload_parse_symbols(elf_file_t);
+
+static struct elf_set_head set_pcpu_list;
+#ifdef VIMAGE
+static struct elf_set_head set_vnet_list;
+#endif
+
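+/*
+ * Track the pre-relocation address range and the relocated base of each
+ * loaded file's pcpu (and, with VIMAGE, vnet) linker set.  The lists are
+ * kept sorted by start address so the set covering a given address can be
+ * found with a single ordered walk.
+ */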
+static void
+elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base)
+{
+ struct elf_set *set, *iter;
+
+ set = malloc(sizeof(*set), M_LINKER, M_WAITOK);
+ set->es_start = start;
+ set->es_stop = stop;
+ set->es_base = base;
+
+ TAILQ_FOREACH(iter, list, es_link) {
+
+ KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) ||
+ (set->es_start > iter->es_start && set->es_stop > iter->es_stop),
+ ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx",
+ (uintmax_t)set->es_start, (uintmax_t)set->es_stop,
+ (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop));
+
+ if (iter->es_start > set->es_start) {
+ TAILQ_INSERT_BEFORE(iter, set, es_link);
+ break;
+ }
+ }
+
+ if (iter == NULL)
+ TAILQ_INSERT_TAIL(list, set, es_link);
+}
+
+static int
+elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base)
+{
+ struct elf_set *set;
+
+ TAILQ_FOREACH(set, list, es_link) {
+ if (addr < set->es_start)
+ return (0);
+ if (addr < set->es_stop) {
+ *start = set->es_start;
+ *base = set->es_base;
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+static void
+elf_set_delete(struct elf_set_head *list, Elf_Addr start)
+{
+ struct elf_set *set;
+
+ TAILQ_FOREACH(set, list, es_link) {
+ if (start < set->es_start)
+ break;
+ if (start == set->es_start) {
+ TAILQ_REMOVE(list, set, es_link);
+ free(set, M_LINKER);
+ return;
+ }
+ }
+ KASSERT(0, ("deleting unknown linker set (start = 0x%jx)",
+ (uintmax_t)start));
+}
+
+#ifdef GDB
+static void r_debug_state(struct r_debug *, struct link_map *);
+
+/*
+ * A list of loaded modules for GDB to use for loading symbols.
+ */
+struct r_debug r_debug;
+
+#define GDB_STATE(s) do { \
+ r_debug.r_state = s; r_debug_state(NULL, NULL); \
+} while (0)
+
+/*
+ * Function for the debugger to set a breakpoint on to gain control.
+ */
+static void
+r_debug_state(struct r_debug *dummy_one __unused,
+ struct link_map *dummy_two __unused)
+{
+}
+
+static void
+link_elf_add_gdb(struct link_map *l)
+{
+ struct link_map *prev;
+
+ l->l_next = NULL;
+
+ if (r_debug.r_map == NULL) {
+ /* Add first. */
+ l->l_prev = NULL;
+ r_debug.r_map = l;
+ } else {
+ /* Append to list. */
+ for (prev = r_debug.r_map;
+ prev->l_next != NULL;
+ prev = prev->l_next)
+ ;
+ l->l_prev = prev;
+ prev->l_next = l;
+ }
+}
+
+static void
+link_elf_delete_gdb(struct link_map *l)
+{
+ if (l->l_prev == NULL) {
+ /* Remove first. */
+ if ((r_debug.r_map = l->l_next) != NULL)
+ l->l_next->l_prev = NULL;
+ } else {
+ /* Remove any but first. */
+ if ((l->l_prev->l_next = l->l_next) != NULL)
+ l->l_next->l_prev = l->l_prev;
+ }
+}
+#endif /* GDB */
+
+#ifdef __ia64__
+Elf_Addr link_elf_get_gp(linker_file_t);
+#endif
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_error(const char *filename, const char *s)
+{
+ if (filename == NULL)
+ printf("kldload: %s\n", s);
+ else
+ printf("kldload: %s: %s\n", filename, s);
+}
+
+/*
+ * Actions performed after linking/loading both the preloaded kernel and any
+ * modules, whether preloaded or dynamically loaded.
+ */
+static int
+link_elf_link_common_finish(linker_file_t lf)
+{
+#ifdef GDB
+ elf_file_t ef = (elf_file_t)lf;
+ char *newfilename;
+#endif
+ int error;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error != 0)
+ return (error);
+
+#ifdef GDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, lf->filename);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+ return (0);
+}
+
+static void
+link_elf_init(void* arg)
+{
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+
+ linker_add_class(&link_elf_class);
+
+ dp = (Elf_Dyn *)&_DYNAMIC;
+ modname = NULL;
+ modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel");
+ if (modptr == NULL)
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr != NULL)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, &link_elf_class);
+ if (linker_kernel_file == NULL)
+ panic("%s: Can't create linker structures for kernel",
+ __func__);
+
+ ef = (elf_file_t) linker_kernel_file;
+ ef->preloaded = 1;
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+
+ if (dp != NULL)
+ parse_dynamic(ef);
+ linker_kernel_file->address = (caddr_t) KERNBASE;
+ linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
+
+ if (modptr != NULL) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr != NULL)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr != NULL)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef GDB
+ r_debug.r_map = NULL;
+ r_debug.r_brk = r_debug_state;
+ r_debug.r_state = RT_CONSISTENT;
+#endif
+
+ (void)link_elf_link_common_finish(linker_kernel_file);
+ linker_kernel_file->flags |= LINKER_FILE_LINKED;
+ TAILQ_INIT(&set_pcpu_list);
+#ifdef VIMAGE
+ TAILQ_INIT(&set_vnet_list);
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
+
+static int
+link_elf_preload_parse_symbols(elf_file_t ef)
+{
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym *symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return (0);
+ pointer = preload_search_info(ef->modptr,
+ MODINFO_METADATA | MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return (0);
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr,
+ MODINFO_METADATA | MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return (0);
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return (EINVAL);
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return (EINVAL);
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return (0);
+}
+
+static int
+parse_dynamic(elf_file_t ef)
+{
+ Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
+ const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return (ENOEXEC);
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return (ENOEXEC);
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return (ENOEXEC);
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return (ENOEXEC);
+ break;
+#ifdef GDB
+ case DT_DEBUG:
+ dp->d_un.d_ptr = (Elf_Addr)&r_debug;
+ break;
+#endif
+ }
+ }
+
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *)ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
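+ /*
+ * For DT_HASH the number of chain entries equals the number of
+ * dynamic symbols, so nchains doubles as the symbol count for the
+ * ddb symbol table defaults below.
+ */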
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return (0);
+}
+
+static int
+parse_dpcpu(elf_file_t ef)
+{
+ int count;
+ int error;
+
+ ef->pcpu_start = 0;
+ ef->pcpu_stop = 0;
+ error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start,
+ (void ***)&ef->pcpu_stop, &count);
+ /* Error just means there is no pcpu set to relocate. */
+ if (error != 0)
+ return (0);
+ count *= sizeof(void *);
+ /*
+ * Allocate space in the primary pcpu area. Copy in our
+ * initialization from the data section and then initialize
+ * all per-cpu storage from that.
+ */
+ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count);
+ if (ef->pcpu_base == 0)
+ return (ENOSPC);
+ memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count);
+ dpcpu_copy((void *)ef->pcpu_base, count);
+ elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop,
+ ef->pcpu_base);
+
+ return (0);
+}
+
+#ifdef VIMAGE
+static int
+parse_vnet(elf_file_t ef)
+{
+ int count;
+ int error;
+
+ ef->vnet_start = 0;
+ ef->vnet_stop = 0;
+ error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start,
+ (void ***)&ef->vnet_stop, &count);
+ /* Error just means there is no vnet data set to relocate. */
+ if (error != 0)
+ return (0);
+ count *= sizeof(void *);
+ /*
+ * Allocate space in the primary vnet area. Copy in our
+ * initialization from the data section and then initialize
+ * all per-vnet storage from that.
+ */
+ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count);
+ if (ef->vnet_base == 0)
+ return (ENOSPC);
+ memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count);
+ vnet_data_copy((void *)ef->vnet_base, count);
+ elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop,
+ ef->vnet_base);
+
+ return (0);
+}
+#endif
+
+static int
+link_elf_link_preload(linker_class_t cls,
+ const char* filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return (ENOENT);
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr,
+ MODINFO_METADATA | MODINFOMD_DYNAMIC);
+ if (type == NULL ||
+ (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 &&
+ strcmp(type, "elf module") != 0))
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL)
+ return (ENOMEM);
+
+ ef = (elf_file_t) lf;
+ ef->preloaded = 1;
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(ef);
+ if (error == 0)
+ error = parse_dpcpu(ef);
+#ifdef VIMAGE
+ if (error == 0)
+ error = parse_vnet(ef);
+#endif
+ if (error != 0) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+ }
+ link_elf_reloc_local(lf);
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+
+ ef = (elf_file_t) lf;
+ error = relocate_file(ef);
+ if (error != 0)
+ return (error);
+ (void)link_elf_preload_parse_symbols(ef);
+
+ return (link_elf_link_common_finish(lf));
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char* filename,
+ linker_file_t* result)
+{
+ struct nameidata nd;
+ struct thread* td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[MAXSEGS];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ ssize_t resid;
+ int flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+
+ shdr = NULL;
+ lf = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG) {
+ error = ENOEXEC;
+ firstpage = NULL;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp);
+ if (error != 0) {
+ firstpage = NULL;
+ goto out;
+ }
+#endif
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ nbytes = PAGE_SIZE - resid;
+ if (error != 0)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error(filename, "Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_version != EV_CURRENT) {
+ link_elf_error(filename, "Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ error = ENOSYS;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error(filename, "Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page.
+ * This is not strictly required by the ABI specification, but
+ * it seems to always be true in practice. And it simplifies
+ * things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error(filename, "Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+ case PT_LOAD:
+ if (nsegs == MAXSEGS) {
+ link_elf_error(filename, "Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ /*
+ * XXX: We just trust that they come in the right order??
+ */
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+
+ case PT_INTERP:
+ error = ENOSYS;
+ goto out;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error(filename, "Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (nsegs == 0) {
+ link_elf_error(filename, "No sections");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake
+ * out our contiguous region, and to establish the base
+ * address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
+ segs[nsegs - 1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ ef = (elf_file_t) lf;
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address, mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != 0) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < nsegs; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ error = vm_map_wire(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+ if (error != KERN_SUCCESS) {
+ error = ENOMEM;
+ goto out;
+ }
+#endif
+ }
+
+#ifdef GPROF
+ /* Update profiling information with the new text segment. */
+ mtx_lock(&Giant);
+ kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
+ segs[0]->p_memsz));
+ mtx_unlock(&Giant);
+#endif
+
+ ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(ef);
+ if (error != 0)
+ goto out;
+ error = parse_dpcpu(ef);
+ if (error != 0)
+ goto out;
+#ifdef VIMAGE
+ error = parse_vnet(ef);
+ if (error != 0)
+ goto out;
+#endif
+ link_elf_reloc_local(lf);
+
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error != 0)
+ goto out;
+ error = relocate_file(ef);
+ if (error != 0)
+ goto out;
+
+ /*
+	 * Try to load the symbol table if it's present. (You can
+	 * strip it!)
+ */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+nosyms:
+ error = link_elf_link_common_finish(lf);
+ if (error != 0)
+ goto out;
+
+ *result = lf;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (error != 0 && lf != NULL)
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ if (shdr != NULL)
+ free(shdr, M_LINKER);
+ if (firstpage != NULL)
+ free(firstpage, M_LINKER);
+
+ return (error);
+}
+
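+/*
+ * Map a kernel virtual address that falls inside this file's dpcpu (or,
+ * with VIMAGE, vnet) linker set to the address of the runtime copy of
+ * that data.
+ */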
+Elf_Addr
+elf_relocaddr(linker_file_t lf, Elf_Addr x)
+{
+ elf_file_t ef;
+
+ ef = (elf_file_t)lf;
+ if (x >= ef->pcpu_start && x < ef->pcpu_stop)
+ return ((x - ef->pcpu_start) + ef->pcpu_base);
+#ifdef VIMAGE
+ if (x >= ef->vnet_start && x < ef->vnet_stop)
+ return ((x - ef->vnet_start) + ef->vnet_base);
+#endif
+ return (x);
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+
+ if (ef->pcpu_base != 0) {
+ dpcpu_free((void *)ef->pcpu_base,
+ ef->pcpu_stop - ef->pcpu_start);
+ elf_set_delete(&set_pcpu_list, ef->pcpu_start);
+ }
+#ifdef VIMAGE
+ if (ef->vnet_base != 0) {
+ vnet_data_free((void *)ef->vnet_base,
+ ef->vnet_stop - ef->vnet_start);
+ elf_set_delete(&set_vnet_list, ef->vnet_start);
+ }
+#endif
+#ifdef GDB
+ if (ef->gdb.l_ld != NULL) {
+ GDB_STATE(RT_DELETE);
+ free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
+ link_elf_delete_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+ }
+#endif
+
+ /* Notify MD code that a module is being unloaded. */
+ elf_cpu_unload_file(file);
+
+ if (ef->preloaded) {
+ link_elf_unload_preload(file);
+ return;
+ }
+
+#ifdef SPARSE_MAPPING
+ if (ef->object != NULL) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ }
+#else
+ if (ef->address != NULL)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase != NULL)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase != NULL)
+ free(ef->strbase, M_LINKER);
+ if (ef->ctftab != NULL)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff != NULL)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff != NULL)
+ free(ef->typoff, M_LINKER);
+}
+
+static void
+link_elf_unload_preload(linker_file_t file)
+{
+ if (file->filename != NULL)
+ preload_delete_name(file->filename);
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Size r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return (ef->strtab + ref->st_name);
+ }
+ return (NULL);
+}
+
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel != NULL) {
+ rellim = (const Elf_Rel *)
+ ((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+				printf("link_elf: symbol %s undefined\n",
+				    symname);
+ return (ENOENT);
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel != NULL) {
+ rellim = (const Elf_Rel *)
+ ((const char *)ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rela++;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return (h);
+}
+
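+/*
+ * Look up a symbol by name: first through the ELF hash table built by the
+ * static linker and, if that fails, by an exhaustive scan of the loaded
+ * (ddb) symbol table.
+ */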
+static int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* If we don't have a hash, bail. */
+ if (ef->buckets == NULL || ef->nbuckets == 0) {
+ printf("link_elf_lookup_symbol: missing symbol hash table\n");
+ return (ENOENT);
+ }
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("%s: corrupt symbol table\n", __func__);
+ return (ENOENT);
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("%s: corrupt symbol table\n", __func__);
+ return (ENOENT);
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return (0);
+ }
+ return (ENOENT);
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return (ENOENT);
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return (0);
+ }
+ return (ENOENT);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
+ linker_symval_t *symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym* es = (const Elf_Sym*) sym;
+
+ if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return (0);
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return (ENOENT);
+ if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t *sym, long *diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value + (uintptr_t) (void *) ef->address;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return (0);
+}
+
+/*
+ * Look up a linker set on an ELF system.
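+ * The set bounds are taken from the "__start_set_<name>" and
+ * "__stop_set_<name>" symbols that the static linker provides for each set.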
+ */
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ char *setsym;
+ void **start, **stop;
+ int len, error = 0, count;
+
+ len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
+ setsym = malloc(len, M_LINKER, M_WAITOK);
+
+ /* get address of first entry */
+ snprintf(setsym, len, "%s%s", "__start_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error != 0)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ start = (void **)symval.value;
+
+ /* get address of last entry */
+ snprintf(setsym, len, "%s%s", "__stop_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error != 0)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ stop = (void **)symval.value;
+
+ /* and the number of entries */
+ count = stop - start;
+
+ /* and copy out */
+ if (startp != NULL)
+ *startp = start;
+ if (stopp != NULL)
+ *stopp = stop;
+ if (countp != NULL)
+ *countp = count;
+
+out:
+ free(setsym, M_LINKER);
+ return (error);
+}
+
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque)
+{
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym *symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static int
+link_elf_each_function_nameval(linker_file_t file,
+ linker_function_nameval_callback_t callback, void *opaque)
+{
+ linker_symval_t symval;
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = link_elf_symbol_values(file,
+ (c_linker_sym_t) symp, &symval);
+ if (error != 0)
+ return (error);
+ error = callback(file, i, &symval, opaque);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+#ifdef __ia64__
+/*
+ * Each KLD has its own GP. The GP value for each load module is given by
+ * DT_PLTGOT on ia64. We need GP to construct function descriptors, but
+ * don't have direct access to the ELF file structure. The link_elf_get_gp()
+ * function returns the GP given a pointer to a generic linker file struct.
+ */
+Elf_Addr
+link_elf_get_gp(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ return ((Elf_Addr)ef->got);
+}
+#endif
+
+const Elf_Sym *
+elf_get_sym(linker_file_t lf, Elf_Size symidx)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ if (symidx >= ef->nchains)
+ return (NULL);
+ return (ef->symtab + symidx);
+}
+
+const char *
+elf_get_symname(linker_file_t lf, Elf_Size symidx)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+
+ if (symidx >= ef->nchains)
+ return (NULL);
+ sym = ef->symtab + symidx;
+ return (ef->strtab + sym->st_name);
+}
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known
+ * (i.e. in relocations). It uses the symbol index directly instead of doing
+ * a full hash-table-based lookup when that is valid, for example for local
+ * symbols. This is not only more efficient, it is also more correct: it is
+ * not always the case that the symbol can be found through the hash table.
+ */
+static Elf_Addr
+elf_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+ Elf_Addr addr, start, base;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->nchains)
+ return (0);
+
+ sym = ef->symtab + symidx;
+
+ /*
+ * Don't do a full lookup when the symbol is local. It may even
+ * fail because it may not be found through the hash table.
+ */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
+ /* Force lookup failure when we have an insanity. */
+ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0)
+ return (0);
+ return ((Elf_Addr)ef->address + sym->st_value);
+ }
+
+ /*
+ * XXX we can avoid doing a hash table based lookup for global
+ * symbols as well. This however is not always valid, so we'll
+ * just do it the hard way for now. Performance tweaks can
+ * always be added.
+ */
+
+ symbol = ef->strtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+
+ addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+
+ if (elf_set_find(&set_pcpu_list, addr, &start, &base))
+ addr = addr - start + base;
+#ifdef VIMAGE
+ else if (elf_set_find(&set_vnet_list, addr, &start, &base))
+ addr = addr - start + base;
+#endif
+	return (addr);
+}
+
+static void
+link_elf_reloc_local(linker_file_t lf)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ elf_file_t ef = (elf_file_t)lf;
+
+ /* Perform relocations without addend if there are any: */
+ if ((rel = ef->rel) != NULL) {
+ rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ elf_reloc_local(lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup);
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ if ((rela = ef->rela) != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ elf_reloc_local(lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup);
+ rela++;
+ }
+ }
+}
+
+static long
+link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *symtab = ef->ddbsymtab;
+
+ if (*symtab == NULL)
+ return (0);
+
+ return (ef->ddbsymcnt);
+}
+
+static long
+link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *strtab = ef->ddbstrtab;
+
+ if (*strtab == NULL)
+ return (0);
+
+ return (ef->ddbstrcnt);
+}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
new file mode 100644
index 0000000..a9208df
--- /dev/null
+++ b/sys/kern/link_elf_obj.c
@@ -0,0 +1,1375 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * Copyright (c) 2004 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/link_elf.h>
+
+#ifdef DDB_CTF
+#include <net/zlib.h>
+#endif
+
+#include "linker_if.h"
+
+typedef struct {
+ void *addr;
+ Elf_Off size;
+ int flags;
+ int sec; /* Original section */
+ char *name;
+} Elf_progent;
+
+typedef struct {
+ Elf_Rel *rel;
+ int nrel;
+ int sec;
+} Elf_relent;
+
+typedef struct {
+ Elf_Rela *rela;
+ int nrela;
+ int sec;
+} Elf_relaent;
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+
+ int preloaded;
+ caddr_t address; /* Relocation address */
+ vm_object_t object; /* VM object to hold file pages */
+ Elf_Shdr *e_shdr;
+
+ Elf_progent *progtab;
+ int nprogtab;
+
+ Elf_relaent *relatab;
+ int nrelatab;
+
+ Elf_relent *reltab;
+ int nreltab;
+
+ Elf_Sym *ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+
+ caddr_t shstrtab; /* Section name string table */
+ long shstrcnt; /* number of bytes in string table */
+
+ caddr_t ctftab; /* CTF table */
+ long ctfcnt; /* number of bytes in CTF table */
+ caddr_t ctfoff; /* CTF offset table */
+ caddr_t typoff; /* Type offset table */
+ long typlen; /* Number of type entries. */
+
+} *elf_file_t;
+
+#include <kern/kern_ctf.c>
+
+static int link_elf_link_preload(linker_class_t cls,
+ const char *, linker_file_t *);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char *, linker_file_t *);
+static int link_elf_lookup_symbol(linker_file_t, const char *,
+ c_linker_sym_t *);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
+ linker_symval_t *);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ c_linker_sym_t *sym, long *diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *), void *);
+static int link_elf_each_function_nameval(linker_file_t,
+ linker_function_nameval_callback_t,
+ void *);
+static void link_elf_reloc_local(linker_file_t);
+static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
+static long link_elf_strtab_get(linker_file_t, caddr_t *);
+
+static Elf_Addr elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
+ KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
+ KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
+ KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32_obj",
+#else
+ "elf64_obj",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int relocate_file(elf_file_t ef);
+
+static void
+link_elf_error(const char *filename, const char *s)
+{
+ if (filename == NULL)
+ printf("kldload: %s\n", s);
+ else
+ printf("kldload: %s: %s\n", filename, s);
+}
+
+static void
+link_elf_init(void *arg)
+{
+
+ linker_add_class(&link_elf_class);
+}
+
+SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
+static int
+link_elf_link_preload(linker_class_t cls, const char *filename,
+ linker_file_t *result)
+{
+ Elf_Ehdr *hdr;
+ Elf_Shdr *shdr;
+ Elf_Sym *es;
+ void *modptr, *baseptr, *sizeptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Addr off;
+ int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return ENOENT;
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA |
+ MODINFOMD_ELFHDR);
+ shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA |
+ MODINFOMD_SHDR);
+ if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE)
+ " obj module") != 0 &&
+ strcmp(type, "elf obj module") != 0)) {
+ return (EFTYPE);
+ }
+ if (baseptr == NULL || sizeptr == NULL || hdr == NULL ||
+ shdr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL)
+ return (ENOMEM);
+
+ ef = (elf_file_t)lf;
+ ef->preloaded = 1;
+ ef->address = *(caddr_t *)baseptr;
+ lf->address = *(caddr_t *)baseptr;
+ lf->size = *(size_t *)sizeptr;
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_version != EV_CURRENT ||
+ hdr->e_type != ET_REL ||
+ hdr->e_machine != ELF_TARG_MACH) {
+ error = EFTYPE;
+ goto out;
+ }
+ ef->e_shdr = shdr;
+
+ /* Scan the section header for information and table sizing. */
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->nprogtab++;
+ break;
+ case SHT_SYMTAB:
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ break;
+ case SHT_REL:
+ ef->nreltab++;
+ break;
+ case SHT_RELA:
+ ef->nrelatab++;
+ break;
+ }
+ }
+
+ shstrindex = hdr->e_shstrndx;
+ if (ef->nprogtab == 0 || symstrindex < 0 ||
+ symstrindex >= hdr->e_shnum ||
+ shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 ||
+ shstrindex >= hdr->e_shnum ||
+ shdr[shstrindex].sh_type != SHT_STRTAB) {
+ printf("%s: bad/missing section headers\n", filename);
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate space for tracking the load chunks */
+ if (ef->nprogtab != 0)
+ ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nreltab != 0)
+ ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nrelatab != 0)
+ ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
+ (ef->nreltab != 0 && ef->reltab == NULL) ||
+ (ef->nrelatab != 0 && ef->relatab == NULL)) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* XXX, relocate the sh_addr fields saved by the loader. */
+ off = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off))
+ off = shdr[i].sh_addr;
+ }
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_addr != 0)
+ shdr[i].sh_addr = shdr[i].sh_addr - off +
+ (Elf_Addr)ef->address;
+ }
+
+ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
+ ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr;
+ ef->ddbstrcnt = shdr[symstrindex].sh_size;
+ ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr;
+ ef->shstrcnt = shdr[shstrindex].sh_size;
+ ef->shstrtab = (char *)shdr[shstrindex].sh_addr;
+
+ /* Now fill out progtab and the relocation tables. */
+ pb = 0;
+ rl = 0;
+ ra = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
+ if (shdr[i].sh_type == SHT_PROGBITS)
+ ef->progtab[pb].name = "<<PROGBITS>>";
+ else
+ ef->progtab[pb].name = "<<NOBITS>>";
+ ef->progtab[pb].size = shdr[i].sh_size;
+ ef->progtab[pb].sec = i;
+ if (ef->shstrtab && shdr[i].sh_name != 0)
+ ef->progtab[pb].name =
+ ef->shstrtab + shdr[i].sh_name;
+ if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
+ void *dpcpu;
+
+ dpcpu = dpcpu_alloc(shdr[i].sh_size);
+ if (dpcpu == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ memcpy(dpcpu, ef->progtab[pb].addr,
+ ef->progtab[pb].size);
+ dpcpu_copy(dpcpu, shdr[i].sh_size);
+ ef->progtab[pb].addr = dpcpu;
+#ifdef VIMAGE
+ } else if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME)) {
+ void *vnet_data;
+
+ vnet_data = vnet_data_alloc(shdr[i].sh_size);
+ if (vnet_data == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ memcpy(vnet_data, ef->progtab[pb].addr,
+ ef->progtab[pb].size);
+ vnet_data_copy(vnet_data, shdr[i].sh_size);
+ ef->progtab[pb].addr = vnet_data;
+#endif
+ }
+
+ /* Update all symbol values with the offset. */
+ for (j = 0; j < ef->ddbsymcnt; j++) {
+ es = &ef->ddbsymtab[j];
+ if (es->st_shndx != i)
+ continue;
+ es->st_value += (Elf_Addr)ef->progtab[pb].addr;
+ }
+ pb++;
+ break;
+ case SHT_REL:
+ ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr;
+ ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
+ ef->reltab[rl].sec = shdr[i].sh_info;
+ rl++;
+ break;
+ case SHT_RELA:
+ ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr;
+ ef->relatab[ra].nrela =
+ shdr[i].sh_size / sizeof(Elf_Rela);
+ ef->relatab[ra].sec = shdr[i].sh_info;
+ ra++;
+ break;
+ }
+ }
+ if (pb != ef->nprogtab)
+ panic("lost progbits");
+ if (rl != ef->nreltab)
+ panic("lost reltab");
+ if (ra != ef->nrelatab)
+ panic("lost relatab");
+
+ /* Local intra-module relocations */
+ link_elf_reloc_local(lf);
+
+ *result = lf;
+ return (0);
+
+out:
+ /* preload not done this way */
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+
+ ef = (elf_file_t)lf;
+ error = relocate_file(ef);
+ if (error)
+ return error;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char *filename,
+ linker_file_t *result)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ Elf_Shdr *shdr;
+ Elf_Sym *es;
+ int nbytes, i, j;
+ vm_offset_t mapbase;
+ size_t mapsize;
+ int error = 0;
+ ssize_t resid;
+ int flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ int symtabindex;
+ int symstrindex;
+ int shstrindex;
+ int nsym;
+ int pb, rl, ra;
+ int alignmask;
+
+ shdr = NULL;
+ lf = NULL;
+ mapsize = 0;
+ hdr = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return error;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG) {
+ error = ENOEXEC;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_kld_check_load(td->td_ucred, nd.ni_vp);
+ if (error) {
+ goto out;
+ }
+#endif
+
+ /* Read the elf header from the file. */
+ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error(filename, "Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error(filename, "Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_REL) {
+ error = ENOSYS;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error(filename, "Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (!lf) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef = (elf_file_t) lf;
+ ef->nprogtab = 0;
+ ef->e_shdr = 0;
+ ef->nreltab = 0;
+ ef->nrelatab = 0;
+
+ /* Allocate and read in the section header */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0 ||
+ hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK);
+ ef->e_shdr = shdr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
+ if (error)
+ goto out;
+ if (resid) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Scan the section header for information and table sizing. */
+ nsym = 0;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->nprogtab++;
+ break;
+ case SHT_SYMTAB:
+ nsym++;
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ break;
+ case SHT_REL:
+ ef->nreltab++;
+ break;
+ case SHT_RELA:
+ ef->nrelatab++;
+ break;
+ case SHT_STRTAB:
+ break;
+ }
+ }
+ if (ef->nprogtab == 0) {
+ link_elf_error(filename, "file has no contents");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (nsym != 1) {
+ /* Only allow one symbol table for now */
+ link_elf_error(filename, "file has no valid symbol table");
+ error = ENOEXEC;
+ goto out;
+ }
+	if (symstrindex < 0 || symstrindex >= hdr->e_shnum ||
+ shdr[symstrindex].sh_type != SHT_STRTAB) {
+ link_elf_error(filename, "file has invalid symbol strings");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate space for tracking the load chunks */
+ if (ef->nprogtab != 0)
+ ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nreltab != 0)
+ ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nrelatab != 0)
+ ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
+ M_LINKER, M_WAITOK | M_ZERO);
+
+ if (symtabindex == -1)
+ panic("lost symbol table index");
+ /* Allocate space for and load the symbol table */
+ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
+ ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab,
+ shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+
+ if (symstrindex == -1)
+ panic("lost symbol string index");
+ /* Allocate space for and load the symbol strings */
+ ef->ddbstrcnt = shdr[symstrindex].sh_size;
+ ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab,
+ shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+
+ /* Do we have a string table for the section names? */
+ shstrindex = -1;
+ if (hdr->e_shstrndx != 0 &&
+ shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
+ shstrindex = hdr->e_shstrndx;
+ ef->shstrcnt = shdr[shstrindex].sh_size;
+ ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER,
+ M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab,
+ shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ /* Size up code/data(progbits) and bss(nobits). */
+ alignmask = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ alignmask = shdr[i].sh_addralign - 1;
+ mapsize += alignmask;
+ mapsize &= ~alignmask;
+ mapsize += shdr[i].sh_size;
+ break;
+ }
+ }
+
+ /*
+ * We know how much space we need for the text/data/bss/etc.
+	 * This stuff needs to be in a single chunk so that profiling etc.
+	 * can get the bounds and gdb can associate offsets with modules.
+ */
+ ef->object = vm_object_allocate(OBJT_DEFAULT,
+ round_page(mapsize) >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+
+ /*
+ * In order to satisfy amd64's architectural requirements on the
+ * location of code and data in the kernel's address space, request a
+ * mapping that is above the kernel.
+ */
+#ifdef __amd64__
+ mapbase = KERNBASE;
+#else
+ mapbase = VM_MIN_KERNEL_ADDRESS;
+#endif
+ error = vm_map_find(kernel_map, ef->object, 0, &mapbase,
+ round_page(mapsize), TRUE, VM_PROT_ALL, VM_PROT_ALL, FALSE);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+
+ /* Wire the pages */
+ error = vm_map_wire(kernel_map, mapbase,
+ mapbase + round_page(mapsize),
+ VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+ if (error != KERN_SUCCESS) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Inform the kld system about the situation */
+ lf->address = ef->address = (caddr_t)mapbase;
+ lf->size = mapsize;
+
+ /*
+ * Now load code/data(progbits), zero bss(nobits), allocate space for
+ * and load relocs
+ */
+ pb = 0;
+ rl = 0;
+ ra = 0;
+ alignmask = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ alignmask = shdr[i].sh_addralign - 1;
+ mapbase += alignmask;
+ mapbase &= ~alignmask;
+ if (ef->shstrtab && shdr[i].sh_name != 0)
+ ef->progtab[pb].name =
+ ef->shstrtab + shdr[i].sh_name;
+ else if (shdr[i].sh_type == SHT_PROGBITS)
+ ef->progtab[pb].name = "<<PROGBITS>>";
+ else
+ ef->progtab[pb].name = "<<NOBITS>>";
+ if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
+ ef->progtab[pb].addr =
+ dpcpu_alloc(shdr[i].sh_size);
+#ifdef VIMAGE
+ else if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME))
+ ef->progtab[pb].addr =
+ vnet_data_alloc(shdr[i].sh_size);
+#endif
+ else
+ ef->progtab[pb].addr =
+ (void *)(uintptr_t)mapbase;
+ if (ef->progtab[pb].addr == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ ef->progtab[pb].size = shdr[i].sh_size;
+ ef->progtab[pb].sec = i;
+ if (shdr[i].sh_type == SHT_PROGBITS) {
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->progtab[pb].addr,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ /* Initialize the per-cpu or vnet area. */
+ if (ef->progtab[pb].addr != (void *)mapbase &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
+ dpcpu_copy(ef->progtab[pb].addr,
+ shdr[i].sh_size);
+#ifdef VIMAGE
+ else if (ef->progtab[pb].addr !=
+ (void *)mapbase &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME))
+ vnet_data_copy(ef->progtab[pb].addr,
+ shdr[i].sh_size);
+#endif
+ } else
+ bzero(ef->progtab[pb].addr, shdr[i].sh_size);
+
+ /* Update all symbol values with the offset. */
+ for (j = 0; j < ef->ddbsymcnt; j++) {
+ es = &ef->ddbsymtab[j];
+ if (es->st_shndx != i)
+ continue;
+ es->st_value += (Elf_Addr)ef->progtab[pb].addr;
+ }
+ mapbase += shdr[i].sh_size;
+ pb++;
+ break;
+ case SHT_REL:
+ ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER,
+ M_WAITOK);
+ ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
+ ef->reltab[rl].sec = shdr[i].sh_info;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (void *)ef->reltab[rl].rel,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ rl++;
+ break;
+ case SHT_RELA:
+ ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER,
+ M_WAITOK);
+ ef->relatab[ra].nrela =
+ shdr[i].sh_size / sizeof(Elf_Rela);
+ ef->relatab[ra].sec = shdr[i].sh_info;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (void *)ef->relatab[ra].rela,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ ra++;
+ break;
+ }
+ }
+ if (pb != ef->nprogtab)
+ panic("lost progbits");
+ if (rl != ef->nreltab)
+ panic("lost reltab");
+ if (ra != ef->nrelatab)
+ panic("lost relatab");
+ if (mapbase != (vm_offset_t)ef->address + mapsize)
+ panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
+ (u_long)mapbase, ef->address, (u_long)mapsize,
+ (u_long)(vm_offset_t)ef->address + mapsize);
+
+ /* Local intra-module relocations */
+ link_elf_reloc_local(lf);
+
+ /* Pull in dependencies */
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ goto out;
+
+ /* External relocations */
+ error = relocate_file(ef);
+ if (error)
+ goto out;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error)
+ goto out;
+
+ *result = lf;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (error && lf)
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ if (hdr)
+ free(hdr, M_LINKER);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+ int i;
+
+ /* Notify MD code that a module is being unloaded. */
+ elf_cpu_unload_file(file);
+
+ if (ef->progtab) {
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (ef->progtab[i].size == 0)
+ continue;
+ if (ef->progtab[i].name == NULL)
+ continue;
+ if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME))
+ dpcpu_free(ef->progtab[i].addr,
+ ef->progtab[i].size);
+#ifdef VIMAGE
+ else if (!strcmp(ef->progtab[i].name, VNET_SETNAME))
+ vnet_data_free(ef->progtab[i].addr,
+ ef->progtab[i].size);
+#endif
+ }
+ }
+ if (ef->preloaded) {
+ if (ef->reltab)
+ free(ef->reltab, M_LINKER);
+ if (ef->relatab)
+ free(ef->relatab, M_LINKER);
+ if (ef->progtab)
+ free(ef->progtab, M_LINKER);
+ if (ef->ctftab)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff)
+ free(ef->typoff, M_LINKER);
+ if (file->filename != NULL)
+ preload_delete_name(file->filename);
+ /* XXX reclaim module memory? */
+ return;
+ }
+
+ for (i = 0; i < ef->nreltab; i++)
+ if (ef->reltab[i].rel)
+ free(ef->reltab[i].rel, M_LINKER);
+ for (i = 0; i < ef->nrelatab; i++)
+ if (ef->relatab[i].rela)
+ free(ef->relatab[i].rela, M_LINKER);
+ if (ef->reltab)
+ free(ef->reltab, M_LINKER);
+ if (ef->relatab)
+ free(ef->relatab, M_LINKER);
+ if (ef->progtab)
+ free(ef->progtab, M_LINKER);
+
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address +
+ (ef->object->size << PAGE_SHIFT));
+ }
+ if (ef->e_shdr)
+ free(ef->e_shdr, M_LINKER);
+ if (ef->ddbsymtab)
+ free(ef->ddbsymtab, M_LINKER);
+ if (ef->ddbstrtab)
+ free(ef->ddbstrtab, M_LINKER);
+ if (ef->shstrtab)
+ free(ef->shstrtab, M_LINKER);
+ if (ef->ctftab)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff)
+ free(ef->typoff, M_LINKER);
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Size r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->ddbsymtab + ELF_R_SYM(r_info);
+ return ef->ddbstrtab + ref->st_name;
+ } else
+ return NULL;
+}
+
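+/*
+ * Return the load address of the program section that was read from ELF
+ * section 'sec', or 0 if no progtab entry matches.
+ */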
+static Elf_Addr
+findbase(elf_file_t ef, int sec)
+{
+ int i;
+ Elf_Addr base = 0;
+
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (sec == ef->progtab[i].sec) {
+ base = (Elf_Addr)ef->progtab[i].addr;
+ break;
+ }
+ }
+ return base;
+}
+
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+ const Elf_Sym *sym;
+ int i;
+ Elf_Size symidx;
+ Elf_Addr base;
+
+ /* Perform relocations without addend if there are any: */
+ for (i = 0; i < ef->nreltab; i++) {
+ rel = ef->reltab[i].rel;
+ if (rel == NULL)
+ panic("lost a reltab!");
+ rellim = rel + ef->reltab[i].nrel;
+ base = findbase(ef, ef->reltab[i].sec);
+ if (base == 0)
+ panic("lost base for reltab");
+ for ( ; rel < rellim; rel++) {
+ symidx = ELF_R_SYM(rel->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Local relocs are already done */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL)
+ continue;
+ if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL,
+ elf_obj_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf_obj: symbol %s undefined\n",
+ symname);
+ return ENOENT;
+ }
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ for (i = 0; i < ef->nrelatab; i++) {
+ rela = ef->relatab[i].rela;
+ if (rela == NULL)
+ panic("lost a relatab!");
+ relalim = rela + ef->relatab[i].nrela;
+ base = findbase(ef, ef->relatab[i].sec);
+ if (base == 0)
+ panic("lost base for relatab");
+ for ( ; rela < relalim; rela++) {
+ symidx = ELF_R_SYM(rela->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Local relocs are already done */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL)
+ continue;
+ if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA,
+ elf_obj_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf_obj: symbol %s undefined\n",
+ symname);
+ return ENOENT;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym *symp;
+ const char *strp;
+ int i;
+
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ }
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
+ linker_symval_t *symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym *es = (const Elf_Sym*) sym;
+
+ if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t)es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t *sym, long *diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym *es;
+ const Elf_Sym *best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return 0;
+}
+
+/*
+ * Look up a linker set on an ELF system.
+ */
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ void **start, **stop;
+ int i, count;
+
+ /* Relative to section number */
+ for (i = 0; i < ef->nprogtab; i++) {
+ if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) &&
+ strcmp(ef->progtab[i].name + 4, name) == 0) {
+ start = (void **)ef->progtab[i].addr;
+ stop = (void **)((char *)ef->progtab[i].addr +
+ ef->progtab[i].size);
+ count = stop - start;
+ if (startp)
+ *startp = start;
+ if (stopp)
+ *stopp = stop;
+ if (countp)
+ *countp = count;
+ return (0);
+ }
+ }
+ return (ESRCH);
+}
+
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque)
+{
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym *symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static int
+link_elf_each_function_nameval(linker_file_t file,
+ linker_function_nameval_callback_t callback, void *opaque)
+{
+ linker_symval_t symval;
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval);
+ if (error)
+ return (error);
+ error = callback(file, i, &symval, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known
+ * (i.e. in relocations). It uses the symbol index directly instead of doing
+ * a full hash-table-based lookup when that is valid, for example for local
+ * symbols. This is not only more efficient, it is also more correct: it is
+ * not always the case that the symbol can be found through the hash table.
+ */
+static Elf_Addr
+elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+ Elf_Addr ret;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->ddbsymcnt)
+ return (0);
+
+ sym = ef->ddbsymtab + symidx;
+
+ /* Quick answer if there is a definition included. */
+ if (sym->st_shndx != SHN_UNDEF)
+ return (sym->st_value);
+
+ /* If we get here, then it is undefined and needs a lookup. */
+ switch (ELF_ST_BIND(sym->st_info)) {
+ case STB_LOCAL:
+ /* Local, but undefined? huh? */
+ return (0);
+
+ case STB_GLOBAL:
+ /* Relative to Data or Function name */
+ symbol = ef->ddbstrtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+ ret = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+ return ret;
+
+ case STB_WEAK:
+ printf("link_elf_obj: Weak symbols not supported\n");
+ return (0);
+
+ default:
+ return (0);
+ }
+}
+
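+/*
+ * Resolve undefined "__start_<set>" and "__stop_<set>" symbols against the
+ * matching progtab entry so that linker sets cover the loaded section
+ * contents before local relocations are applied.
+ */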
+static void
+link_elf_fix_link_set(elf_file_t ef)
+{
+ static const char startn[] = "__start_";
+ static const char stopn[] = "__stop_";
+ Elf_Sym *sym;
+ const char *sym_name, *linkset_name;
+ Elf_Addr startp, stopp;
+ Elf_Size symidx;
+ int start, i;
+
+ startp = stopp = 0;
+ for (symidx = 1 /* zero entry is special */;
+ symidx < ef->ddbsymcnt; symidx++) {
+ sym = ef->ddbsymtab + symidx;
+ if (sym->st_shndx != SHN_UNDEF)
+ continue;
+
+ sym_name = ef->ddbstrtab + sym->st_name;
+ if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) {
+ start = 1;
+ linkset_name = sym_name + sizeof(startn) - 1;
+ }
+ else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) {
+ start = 0;
+ linkset_name = sym_name + sizeof(stopn) - 1;
+ }
+ else
+ continue;
+
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (strcmp(ef->progtab[i].name, linkset_name) == 0) {
+ startp = (Elf_Addr)ef->progtab[i].addr;
+ stopp = (Elf_Addr)(startp + ef->progtab[i].size);
+ break;
+ }
+ }
+ if (i == ef->nprogtab)
+ continue;
+
+ sym->st_value = start ? startp : stopp;
+ sym->st_shndx = i;
+ }
+}
+
+static void
+link_elf_reloc_local(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const Elf_Sym *sym;
+ Elf_Addr base;
+ int i;
+ Elf_Size symidx;
+
+ link_elf_fix_link_set(ef);
+
+ /* Perform relocations without addend if there are any: */
+ for (i = 0; i < ef->nreltab; i++) {
+ rel = ef->reltab[i].rel;
+ if (rel == NULL)
+ panic("lost a reltab!");
+ rellim = rel + ef->reltab[i].nrel;
+ base = findbase(ef, ef->reltab[i].sec);
+ if (base == 0)
+ panic("lost base for reltab");
+ for ( ; rel < rellim; rel++) {
+ symidx = ELF_R_SYM(rel->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Only do local relocs */
+ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
+ continue;
+ elf_reloc_local(lf, base, rel, ELF_RELOC_REL,
+ elf_obj_lookup);
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ for (i = 0; i < ef->nrelatab; i++) {
+ rela = ef->relatab[i].rela;
+ if (rela == NULL)
+ panic("lost a relatab!");
+ relalim = rela + ef->relatab[i].nrela;
+ base = findbase(ef, ef->relatab[i].sec);
+ if (base == 0)
+ panic("lost base for relatab");
+ for ( ; rela < relalim; rela++) {
+ symidx = ELF_R_SYM(rela->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Only do local relocs */
+ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
+ continue;
+ elf_reloc_local(lf, base, rela, ELF_RELOC_RELA,
+ elf_obj_lookup);
+ }
+ }
+}
+
+static long
+link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *symtab = ef->ddbsymtab;
+
+ if (*symtab == NULL)
+ return (0);
+
+ return (ef->ddbsymcnt);
+}
+
+static long
+link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *strtab = ef->ddbstrtab;
+
+ if (*strtab == NULL)
+ return (0);
+
+ return (ef->ddbstrcnt);
+}
diff --git a/sys/kern/linker_if.m b/sys/kern/linker_if.m
new file mode 100644
index 0000000..3df592c
--- /dev/null
+++ b/sys/kern/linker_if.m
@@ -0,0 +1,145 @@
+#-
+# Copyright (c) 2000 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/linker.h>
+
+INTERFACE linker;
+
+#
+# Look up a symbol in the file's symbol table. If the symbol is not
+# found then return ENOENT, otherwise zero.
+#
+METHOD int lookup_symbol {
+ linker_file_t file;
+ const char* name;
+ c_linker_sym_t* symp;
+};
+
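+#
+# Return the name, value and size of the given symbol in *valp.
+#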
+METHOD int symbol_values {
+ linker_file_t file;
+ c_linker_sym_t sym;
+ linker_symval_t* valp;
+};
+
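+#
+# Search for the symbol whose value is closest to (at or below) the given
+# address, returning it in *symp and the distance in *diffp.
+#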
+METHOD int search_symbol {
+ linker_file_t file;
+ caddr_t value;
+ c_linker_sym_t* symp;
+ long* diffp;
+};
+
+#
+# Call the callback with each specified function defined in the file.
+# Stop and return the error if the callback returns an error.
+#
+METHOD int each_function_name {
+ linker_file_t file;
+ linker_function_name_callback_t callback;
+ void* opaque;
+};
+
+#
+# Call the callback with each specified function and its value
+# defined in the file.
+# Stop and return the error if the callback returns an error.
+#
+METHOD int each_function_nameval {
+ linker_file_t file;
+ linker_function_nameval_callback_t callback;
+ void* opaque;
+};
+
+#
+# Search for a linker set in a file. Return a pointer to the first
+# entry (which is itself a pointer), and the number of entries.
+# "stop" points to the entry beyond the last valid entry.
+# If count, start or stop are NULL, they are not returned.
+#
+METHOD int lookup_set {
+ linker_file_t file;
+ const char* name;
+ void*** start;
+ void*** stop;
+ int* count;
+};
+
+#
+# Unload a file, releasing dependencies and freeing storage.
+#
+METHOD void unload {
+ linker_file_t file;
+};
+
+#
+# Load CTF data if necessary and if there is a .SUNW_ctf section
+# in the ELF file, returning info in the linker CTF structure.
+#
+METHOD int ctf_get {
+ linker_file_t file;
+ linker_ctf_t *lc;
+};
+
+#
+# Get the symbol table, returning it in **symtab. Return the
+# number of symbols, or zero if there is no symbol table.
+#
+METHOD long symtab_get {
+ linker_file_t file;
+ const Elf_Sym **symtab;
+};
+
+#
+# Get the string table, returning it in *strtab. Return the
+# size (in bytes) of the string table, or zero if there is no string table.
+#
+METHOD long strtab_get {
+ linker_file_t file;
+ caddr_t *strtab;
+};
+
+#
+# Load a file, returning the new linker_file_t in *result. If
+# the class does not recognise the file type, zero should be
+# returned, without modifying *result. If the file is
+# recognised, the file should be loaded, *result set to the new
+# file and zero returned. If some other error is detected an
+# appropriate errno should be returned.
+#
+STATICMETHOD int load_file {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+STATICMETHOD int link_preload {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+METHOD int link_preload_finish {
+ linker_file_t file;
+};
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
new file mode 100644
index 0000000..21e6046
--- /dev/null
+++ b/sys/kern/makesyscalls.sh
@@ -0,0 +1,653 @@
+#! /bin/sh -
+# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
+# $FreeBSD$
+
+set -e
+
+# name of compat options:
+compat=COMPAT_43
+compat4=COMPAT_FREEBSD4
+compat6=COMPAT_FREEBSD6
+compat7=COMPAT_FREEBSD7
+
+# output files:
+sysnames="syscalls.c"
+sysproto="../sys/sysproto.h"
+sysproto_h=_SYS_SYSPROTO_H_
+syshdr="../sys/syscall.h"
+sysmk="../sys/syscall.mk"
+syssw="init_sysent.c"
+syscallprefix="SYS_"
+switchname="sysent"
+namesname="syscallnames"
+systrace="systrace_args.c"
+
+# tmp files:
+sysaue="sysent.aue.$$"
+sysdcl="sysent.dcl.$$"
+syscompat="sysent.compat.$$"
+syscompatdcl="sysent.compatdcl.$$"
+syscompat4="sysent.compat4.$$"
+syscompat4dcl="sysent.compat4dcl.$$"
+syscompat6="sysent.compat6.$$"
+syscompat6dcl="sysent.compat6dcl.$$"
+syscompat7="sysent.compat7.$$"
+syscompat7dcl="sysent.compat7dcl.$$"
+sysent="sysent.switch.$$"
+sysinc="sysinc.switch.$$"
+sysarg="sysarg.switch.$$"
+sysprotoend="sysprotoend.$$"
+systracetmp="systrace.$$"
+systraceret="systraceret.$$"
+
+if [ -r capabilities.conf ]; then
+ capenabled=`cat capabilities.conf | grep -v "^#" | grep -v "^$"`
+ capenabled=`echo $capenabled | sed 's/ /,/g'`
+else
+ capenabled=""
+fi
+
+trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret" 0
+
+touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret
+
+case $# in
+ 0) echo "usage: $0 input-file <config-file>" 1>&2
+ exit 1
+ ;;
+esac
+
+if [ -n "$2" -a -f "$2" ]; then
+ . $2
+fi
+
+sed -e '
+s/\$//g
+:join
+ /\\$/{a\
+
+ N
+ s/\\\n//
+ b join
+ }
+2,${
+ /^#/!s/\([{}()*,]\)/ \1 /g
+}
+' < $1 | awk "
+ BEGIN {
+ sysaue = \"$sysaue\"
+ sysdcl = \"$sysdcl\"
+ sysproto = \"$sysproto\"
+ sysprotoend = \"$sysprotoend\"
+ sysproto_h = \"$sysproto_h\"
+ syscompat = \"$syscompat\"
+ syscompatdcl = \"$syscompatdcl\"
+ syscompat4 = \"$syscompat4\"
+ syscompat4dcl = \"$syscompat4dcl\"
+ syscompat6 = \"$syscompat6\"
+ syscompat6dcl = \"$syscompat6dcl\"
+ syscompat7 = \"$syscompat7\"
+ syscompat7dcl = \"$syscompat7dcl\"
+ sysent = \"$sysent\"
+ syssw = \"$syssw\"
+ sysinc = \"$sysinc\"
+ sysarg = \"$sysarg\"
+ sysnames = \"$sysnames\"
+ syshdr = \"$syshdr\"
+ sysmk = \"$sysmk\"
+ systrace = \"$systrace\"
+ systracetmp = \"$systracetmp\"
+ systraceret = \"$systraceret\"
+ compat = \"$compat\"
+ compat4 = \"$compat4\"
+ compat6 = \"$compat6\"
+ compat7 = \"$compat7\"
+ syscallprefix = \"$syscallprefix\"
+ switchname = \"$switchname\"
+ namesname = \"$namesname\"
+ infile = \"$1\"
+ capenabled_string = \"$capenabled\"
+ "'
+
+ split(capenabled_string, capenabled, ",");
+
+ printf "/*\n * System call switch table.\n *\n" > syssw
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
+ printf " * $%s$\n", "FreeBSD" > syssw
+
+ printf "/*\n * System call prototypes.\n *\n" > sysarg
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
+ printf " * $%s$\n", "FreeBSD" > sysarg
+
+ printf "\n#ifdef %s\n\n", compat > syscompat
+ printf "\n#ifdef %s\n\n", compat4 > syscompat4
+ printf "\n#ifdef %s\n\n", compat6 > syscompat6
+ printf "\n#ifdef %s\n\n", compat7 > syscompat7
+
+ printf "/*\n * System call names.\n *\n" > sysnames
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+ printf " * $%s$\n", "FreeBSD" > sysnames
+
+ printf "/*\n * System call numbers.\n *\n" > syshdr
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
+ printf " * $%s$\n", "FreeBSD" > syshdr
+ printf "# FreeBSD system call names.\n" > sysmk
+ printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
+ printf "# $%s$\n", "FreeBSD" > sysmk
+
+	printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
+ printf " * $%s$\n", "FreeBSD" > systrace
+ }
+ NR == 1 {
+ gsub("[$]FreeBSD: ", "", $0)
+ gsub(" [$]", "", $0)
+
+ printf " * created from%s\n */\n\n", $0 > syssw
+
+ printf "\n/* The casts are bogus but will do for now. */\n" > sysent
+ printf "struct sysent %s[] = {\n",switchname > sysent
+
+ printf " * created from%s\n */\n\n", $0 > sysarg
+ printf "#ifndef %s\n", sysproto_h > sysarg
+ printf "#define\t%s\n\n", sysproto_h > sysarg
+ printf "#include <sys/signal.h>\n" > sysarg
+ printf "#include <sys/acl.h>\n" > sysarg
+ printf "#include <sys/cpuset.h>\n" > sysarg
+ printf "#include <sys/_ffcounter.h>\n" > sysarg
+ printf "#include <sys/_semaphore.h>\n" > sysarg
+ printf "#include <sys/ucontext.h>\n\n" > sysarg
+ printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
+ printf "struct proc;\n\n" > sysarg
+ printf "struct thread;\n\n" > sysarg
+ printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
+ printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
+ printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg
+ printf "#define\tPADL_(t)\t0\n" > sysarg
+ printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg
+ printf "#else\n" > sysarg
+ printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg
+ printf "#define\tPADR_(t)\t0\n" > sysarg
+ printf "#endif\n\n" > sysarg
+
+ printf " * created from%s\n */\n\n", $0 > sysnames
+ printf "const char *%s[] = {\n", namesname > sysnames
+
+ printf " * created from%s\n */\n\n", $0 > syshdr
+
+ printf "# created from%s\nMIASM = ", $0 > sysmk
+
+ printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
+ printf "static void\nsystrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)\n{\n" > systrace
+ printf "\tint64_t *iarg = (int64_t *) uarg;\n" > systrace
+ printf "\tswitch (sysnum) {\n" > systrace
+
+ printf "static void\nsystrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systracetmp
+ printf "\tswitch (sysnum) {\n" > systracetmp
+
+ printf "static void\nsystrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systraceret
+ printf "\tswitch (sysnum) {\n" > systraceret
+
+ next
+ }
+ NF == 0 || $1 ~ /^;/ {
+ next
+ }
+ $1 ~ /^#[ ]*include/ {
+ print > sysinc
+ next
+ }
+ $1 ~ /^#[ ]*if/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ savesyscall = syscall
+ next
+ }
+ $1 ~ /^#[ ]*else/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ syscall = savesyscall
+ next
+ }
+ $1 ~ /^#/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ next
+ }
+ syscall != $1 {
+ printf "%s: line %d: syscall number out of sync at %d\n",
+ infile, NR, syscall
+ printf "line is:\n"
+ print
+ exit 1
+ }
+ # Returns true if the type "name" is the first flag in the type field
+ function type(name, flags, n) {
+ n = split($3, flags, /\|/)
+ return (n > 0 && flags[1] == name)
+ }
+ # Returns true if the flag "name" is set in the type field
+ function flag(name, flags, i, n) {
+ n = split($3, flags, /\|/)
+ for (i = 1; i <= n; i++)
+ if (flags[i] == name)
+ return 1
+ return 0
+ }
+ function align_sysent_comment(column) {
+ printf("\t") > sysent
+ column = column + 8 - column % 8
+ while (column < 56) {
+ printf("\t") > sysent
+ column = column + 8
+ }
+ }
+ function parserr(was, wanted) {
+ printf "%s: line %d: unexpected %s (expected %s)\n",
+ infile, NR, was, wanted
+ exit 1
+ }
+ function parseline() {
+ f=4 # toss number, type, audit event
+ argc= 0;
+ argssize = "0"
+ thr_flag = "SY_THR_STATIC"
+ if (flag("NOTSTATIC")) {
+ thr_flag = "SY_THR_ABSENT"
+ }
+ if ($NF != "}") {
+ funcalias=$(NF-2)
+ argalias=$(NF-1)
+ rettype=$NF
+ end=NF-3
+ } else {
+ funcalias=""
+ argalias=""
+ rettype="int"
+ end=NF
+ }
+ if (flag("NODEF")) {
+ auditev="AUE_NULL"
+ funcname=$4
+ argssize = "AS(" $6 ")"
+ return
+ }
+ if ($f != "{")
+ parserr($f, "{")
+ f++
+ if ($end != "}")
+ parserr($end, "}")
+ end--
+ if ($end != ";")
+ parserr($end, ";")
+ end--
+ if ($end != ")")
+ parserr($end, ")")
+ end--
+
+ syscallret=$f
+ f++
+
+ funcname=$f
+
+ #
+ # We now know the func name, so define a flags field for it.
+ # Do this before any other processing as we may return early
+ # from it.
+ #
+ for (cap in capenabled) {
+ if (funcname == capenabled[cap]) {
+ flags = "SYF_CAPENABLED";
+ }
+ }
+
+ if (funcalias == "")
+ funcalias = funcname
+ if (argalias == "") {
+ argalias = funcname "_args"
+ if (flag("COMPAT"))
+ argalias = "o" argalias
+ if (flag("COMPAT4"))
+ argalias = "freebsd4_" argalias
+ if (flag("COMPAT6"))
+ argalias = "freebsd6_" argalias
+ if (flag("COMPAT7"))
+ argalias = "freebsd7_" argalias
+ }
+ f++
+
+ if ($f != "(")
+		parserr($f, "(")
+ f++
+
+ if (f == end) {
+ if ($f != "void")
+ parserr($f, "argument definition")
+ return
+ }
+
+ while (f <= end) {
+ argc++
+ argtype[argc]=""
+ oldf=""
+ while (f < end && $(f+1) != ",") {
+ if (argtype[argc] != "" && oldf != "*")
+ argtype[argc] = argtype[argc]" ";
+ argtype[argc] = argtype[argc]$f;
+ oldf = $f;
+ f++
+ }
+ if (argtype[argc] == "")
+ parserr($f, "argument definition")
+ argname[argc]=$f;
+ f += 2; # skip name, and any comma
+ }
+ if (argc != 0)
+ argssize = "AS(" argalias ")"
+ }
+ { comment = $4
+ if (NF < 7)
+ for (i = 5; i <= NF; i++)
+ comment = comment " " $i
+ }
+
+ #
+ # The AUE_ audit event identifier.
+ #
+ {
+ auditev = $2;
+ }
+
+ #
+ # The flags, if any.
+ #
+ {
+ flags = "0";
+ }
+
+ type("STD") || type("NODEF") || type("NOARGS") || type("NOPROTO") \
+ || type("NOSTD") {
+ parseline()
+ printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
+ printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systracetmp
+ printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systraceret
+ if (argc > 0) {
+ printf("\t\tswitch(ndx) {\n") > systracetmp
+ printf("\t\tstruct %s *p = params;\n", argalias) > systrace
+ for (i = 1; i <= argc; i++) {
+ printf("\t\tcase %d:\n\t\t\tp = \"%s\";\n\t\t\tbreak;\n", i - 1, argtype[i]) > systracetmp
+ if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
+ printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
+ printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else
+ printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ }
+ printf("\t\tdefault:\n\t\t\tbreak;\n\t\t};\n") > systracetmp
+
+ printf("\t\tif (ndx == 0 || ndx == 1)\n") > systraceret
+ printf("\t\t\tp = \"%s\";\n", syscallret) > systraceret
+ printf("\t\tbreak;\n") > systraceret
+ }
+ printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
+ printf("\t\tbreak;\n") > systracetmp
+ if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
+ !flag("NODEF")) {
+ printf("struct %s {\n", argalias) > sysarg
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; " \
+ "%s %s; char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > sysarg
+ printf("};\n") > sysarg
+ }
+ else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ if (!flag("NOPROTO") && !flag("NODEF")) {
+ if (funcname == "nosys" || funcname == "lkmnosys" ||
+ funcname == "sysarch" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s\t%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ } else {
+ printf("%s\tsys_%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ }
+ printf(";\n") > sysdcl
+ printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
+ funcalias, auditev) > sysaue
+ }
+ printf("\t{ %s, (sy_call_t *)", argssize) > sysent
+ column = 8 + 2 + length(argssize) + 15
+ if (flag("NOSTD")) {
+ printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT") > sysent
+ column = column + length("lkmressys") + length("AUE_NULL") + 3
+ } else {
+ if (funcname == "nosys" || funcname == "sysarch" ||
+ funcname == "lkmnosys" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3
+ } else {
+ printf("sys_%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3 + 4
+ }
+ }
+ align_sysent_comment(column)
+ printf("/* %d = %s */\n", syscall, funcalias) > sysent
+ printf("\t\"%s\",\t\t\t/* %d = %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ if (!flag("NODEF")) {
+ printf("#define\t%s%s\t%d\n", syscallprefix,
+ funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ }
+ syscall++
+ next
+ }
+ type("COMPAT") || type("COMPAT4") || type("COMPAT6") || \
+ type("COMPAT7") {
+ if (flag("COMPAT")) {
+ ncompat++
+ out = syscompat
+ outdcl = syscompatdcl
+ wrap = "compat"
+ prefix = "o"
+ descr = "old"
+ } else if (flag("COMPAT4")) {
+ ncompat4++
+ out = syscompat4
+ outdcl = syscompat4dcl
+ wrap = "compat4"
+ prefix = "freebsd4_"
+ descr = "freebsd4"
+ } else if (flag("COMPAT6")) {
+ ncompat6++
+ out = syscompat6
+ outdcl = syscompat6dcl
+ wrap = "compat6"
+ prefix = "freebsd6_"
+ descr = "freebsd6"
+ } else if (flag("COMPAT7")) {
+ ncompat7++
+ out = syscompat7
+ outdcl = syscompat7dcl
+ wrap = "compat7"
+ prefix = "freebsd7_"
+ descr = "freebsd7"
+ }
+ parseline()
+ if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
+ !flag("NODEF")) {
+ printf("struct %s {\n", argalias) > out
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \
+ "char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > out
+ printf("};\n") > out
+ }
+ else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ if (!flag("NOPROTO") && !flag("NODEF")) {
+ printf("%s\t%s%s(struct thread *, struct %s *);\n",
+ rettype, prefix, funcname, argalias) > outdcl
+ printf("#define\t%sAUE_%s%s\t%s\n", syscallprefix,
+ prefix, funcname, auditev) > sysaue
+ }
+ if (flag("NOSTD")) {
+ printf("\t{ %s, (sy_call_t *)%s, %s, NULL, 0, 0, 0, SY_THR_ABSENT },",
+ "0", "lkmressys", "AUE_NULL") > sysent
+ align_sysent_comment(8 + 2 + length("0") + 15 + \
+ length("lkmressys") + length("AUE_NULL") + 3)
+ } else {
+ printf("\t{ %s(%s,%s), %s, NULL, 0, 0, %s, %s },",
+ wrap, argssize, funcname, auditev, flags, thr_flag) > sysent
+ align_sysent_comment(8 + 9 + length(argssize) + 1 + \
+ length(funcname) + length(auditev) + \
+ length(flags) + 4)
+ }
+ printf("/* %d = %s %s */\n", syscall, descr, funcalias) > sysent
+ printf("\t\"%s.%s\",\t\t/* %d = %s %s */\n",
+ wrap, funcalias, syscall, descr, funcalias) > sysnames
+ if (flag("COMPAT")) {
+ printf("\t\t\t\t/* %d is old %s */\n",
+ syscall, funcalias) > syshdr
+ } else if (!flag("NODEF")) {
+ printf("#define\t%s%s%s\t%d\n", syscallprefix,
+ prefix, funcalias, syscall) > syshdr
+ printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
+ }
+ syscall++
+ next
+ }
+ type("OBSOL") {
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },") > sysent
+ align_sysent_comment(34)
+ printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
+ printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
+ $4, syscall, comment) > sysnames
+ printf("\t\t\t\t/* %d is obsolete %s */\n",
+ syscall, comment) > syshdr
+ syscall++
+ next
+ }
+ type("UNIMPL") {
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },\t\t\t/* %d = %s */\n",
+ syscall, comment) > sysent
+ printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
+ syscall, syscall, comment) > sysnames
+ syscall++
+ next
+ }
+ {
+ printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $3
+ exit 1
+ }
+ END {
+ printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
+
+ if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0 || ncompat7 != 0)
+ printf "#include \"opt_compat.h\"\n\n" > syssw
+
+ if (ncompat != 0) {
+ printf "\n#ifdef %s\n", compat > sysinc
+ printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat4 != 0) {
+ printf "\n#ifdef %s\n", compat4 > sysinc
+ printf "#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat4(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat6 != 0) {
+ printf "\n#ifdef %s\n", compat6 > sysinc
+ printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat7 != 0) {
+ printf "\n#ifdef %s\n", compat7 > sysinc
+ printf "#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat7(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+ printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
+ printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
+ printf("\n#endif /* %s */\n\n", compat7) > syscompat7dcl
+
+ printf("\n#undef PAD_\n") > sysprotoend
+ printf("#undef PADL_\n") > sysprotoend
+ printf("#undef PADR_\n") > sysprotoend
+ printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
+
+ printf("\n") > sysmk
+ printf("};\n") > sysent
+ printf("};\n") > sysnames
+ printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
+ > syshdr
+ printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
+ printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systracetmp
+ printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systraceret
+ } '
+
+cat $sysinc $sysent >> $syssw
+cat $sysarg $sysdcl \
+ $syscompat $syscompatdcl \
+ $syscompat4 $syscompat4dcl \
+ $syscompat6 $syscompat6dcl \
+ $syscompat7 $syscompat7dcl \
+ $sysaue $sysprotoend > $sysproto
+cat $systracetmp >> $systrace
+cat $systraceret >> $systrace
+
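For reference, the script consumes syscalls.master-style lines of the form "number  audit-event  type  { prototype }". A hedged illustration of one STD entry and the init_sysent.c row the rules above would emit for it, assuming the syscall is not listed in capabilities.conf (the number shown is illustrative):

    /* syscalls.master input line:
     *   36   AUE_SYNC   STD   { int sync(void); }
     * row produced by the STD branch's printf format: */
    { 0, (sy_call_t *)sys_sync, AUE_SYNC, NULL, 0, 0, 0, SY_THR_STATIC },    /* 36 = sync */

Because the prototype takes no arguments, sysproto.h would also receive a dummy struct sync_args { register_t dummy; } and syscall.h a #define SYS_sync with the same number.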
diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c
new file mode 100644
index 0000000..84a294a
--- /dev/null
+++ b/sys/kern/md4c.c
@@ -0,0 +1,288 @@
+/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm
+ */
+
+/*-
+ Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
+
+ License to copy and use this software is granted provided that it
+ is identified as the "RSA Data Security, Inc. MD4 Message-Digest
+ Algorithm" in all material mentioning or referencing this software
+ or this function.
+
+ License is also granted to make and use derivative works provided
+ that such works are identified as "derived from the RSA Data
+ Security, Inc. MD4 Message-Digest Algorithm" in all material
+ mentioning or referencing the derived work.
+
+ RSA Data Security, Inc. makes no representations concerning either
+ the merchantability of this software or the suitability of this
+ software for any particular purpose. It is provided "as is"
+ without express or implied warranty of any kind.
+
+ These notices must be retained in any copies of any part of this
+ documentation and/or software.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/md4.h>
+
+typedef unsigned char *POINTER;
+typedef u_int16_t UINT2;
+typedef u_int32_t UINT4;
+
+#define PROTO_LIST(list) list
+
+/* Constants for MD4Transform routine.
+ */
+#define S11 3
+#define S12 7
+#define S13 11
+#define S14 19
+#define S21 3
+#define S22 5
+#define S23 9
+#define S24 13
+#define S31 3
+#define S32 9
+#define S33 11
+#define S34 15
+
+static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
+static void Encode PROTO_LIST
+ ((unsigned char *, UINT4 *, unsigned int));
+static void Decode PROTO_LIST
+ ((UINT4 *, const unsigned char *, unsigned int));
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G and H are basic MD4 functions.
+ */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/* ROTATE_LEFT rotates x left n bits.
+ */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG and HH are transformations for rounds 1, 2 and 3 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s) { \
+ (a) += F ((b), (c), (d)) + (x); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define GG(a, b, c, d, x, s) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define HH(a, b, c, d, x, s) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+
+/* MD4 initialization. Begins an MD4 operation, writing a new context.
+ */
+void MD4Init (context)
+MD4_CTX *context; /* context */
+{
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants.
+ */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/* MD4 block update operation. Continues an MD4 message-digest
+ operation, processing another message block, and updating the
+ context.
+ */
+void MD4Update (context, input, inputLen)
+MD4_CTX *context; /* context */
+const unsigned char *input; /* input block */
+unsigned int inputLen; /* length of input block */
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+ /* Update number of bits */
+ if ((context->count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((UINT4)inputLen >> 29);
+
+ partLen = 64 - index;
+ /* Transform as many times as possible.
+ */
+ if (inputLen >= partLen) {
+ bcopy(input, &context->buffer[index], partLen);
+ MD4Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD4Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ bcopy(&input[i], &context->buffer[index], inputLen-i);
+}
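A quick worked example of the buffering above (MD5Update in md5c.c follows the same logic): with 10 bytes already buffered (index = 10) and inputLen = 200, partLen is 54, so the first transform consumes 54 input bytes to complete a block; the loop then transforms the 64-byte blocks starting at input offsets 54 and 118, and the trailing 18 bytes (200 - 182) are copied into the context buffer to await the next call.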
+
+/* MD4 padding. */
+void MD4Pad (context)
+MD4_CTX *context; /* context */
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64.
+ */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD4Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD4Update (context, bits, 8);
+}
+
+/* MD4 finalization. Ends an MD4 message-digest operation, writing the
+   message digest and zeroizing the context.
+ */
+void MD4Final (digest, context)
+unsigned char digest[16]; /* message digest */
+MD4_CTX *context; /* context */
+{
+ /* Do padding */
+ MD4Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)context, sizeof (*context));
+}
+
+/* MD4 basic transformation. Transforms state based on block.
+ */
+static void MD4Transform (state, block)
+UINT4 state[4];
+const unsigned char block[64];
+{
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11); /* 1 */
+ FF (d, a, b, c, x[ 1], S12); /* 2 */
+ FF (c, d, a, b, x[ 2], S13); /* 3 */
+ FF (b, c, d, a, x[ 3], S14); /* 4 */
+ FF (a, b, c, d, x[ 4], S11); /* 5 */
+ FF (d, a, b, c, x[ 5], S12); /* 6 */
+ FF (c, d, a, b, x[ 6], S13); /* 7 */
+ FF (b, c, d, a, x[ 7], S14); /* 8 */
+ FF (a, b, c, d, x[ 8], S11); /* 9 */
+ FF (d, a, b, c, x[ 9], S12); /* 10 */
+ FF (c, d, a, b, x[10], S13); /* 11 */
+ FF (b, c, d, a, x[11], S14); /* 12 */
+ FF (a, b, c, d, x[12], S11); /* 13 */
+ FF (d, a, b, c, x[13], S12); /* 14 */
+ FF (c, d, a, b, x[14], S13); /* 15 */
+ FF (b, c, d, a, x[15], S14); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 0], S21); /* 17 */
+ GG (d, a, b, c, x[ 4], S22); /* 18 */
+ GG (c, d, a, b, x[ 8], S23); /* 19 */
+ GG (b, c, d, a, x[12], S24); /* 20 */
+ GG (a, b, c, d, x[ 1], S21); /* 21 */
+ GG (d, a, b, c, x[ 5], S22); /* 22 */
+ GG (c, d, a, b, x[ 9], S23); /* 23 */
+ GG (b, c, d, a, x[13], S24); /* 24 */
+ GG (a, b, c, d, x[ 2], S21); /* 25 */
+ GG (d, a, b, c, x[ 6], S22); /* 26 */
+ GG (c, d, a, b, x[10], S23); /* 27 */
+ GG (b, c, d, a, x[14], S24); /* 28 */
+ GG (a, b, c, d, x[ 3], S21); /* 29 */
+ GG (d, a, b, c, x[ 7], S22); /* 30 */
+ GG (c, d, a, b, x[11], S23); /* 31 */
+ GG (b, c, d, a, x[15], S24); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 0], S31); /* 33 */
+ HH (d, a, b, c, x[ 8], S32); /* 34 */
+ HH (c, d, a, b, x[ 4], S33); /* 35 */
+ HH (b, c, d, a, x[12], S34); /* 36 */
+ HH (a, b, c, d, x[ 2], S31); /* 37 */
+ HH (d, a, b, c, x[10], S32); /* 38 */
+ HH (c, d, a, b, x[ 6], S33); /* 39 */
+ HH (b, c, d, a, x[14], S34); /* 40 */
+ HH (a, b, c, d, x[ 1], S31); /* 41 */
+ HH (d, a, b, c, x[ 9], S32); /* 42 */
+ HH (c, d, a, b, x[ 5], S33); /* 43 */
+ HH (b, c, d, a, x[13], S34); /* 44 */
+ HH (a, b, c, d, x[ 3], S31); /* 45 */
+ HH (d, a, b, c, x[11], S32); /* 46 */
+ HH (c, d, a, b, x[ 7], S33); /* 47 */
+ HH (b, c, d, a, x[15], S34); /* 48 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)x, sizeof (x));
+}
+
+/* Encodes input (UINT4) into output (unsigned char). Assumes len is
+ a multiple of 4.
+ */
+static void Encode (output, input, len)
+unsigned char *output;
+UINT4 *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+}
+
+/* Decodes input (unsigned char) into output (UINT4). Assumes len is
+ a multiple of 4.
+ */
+static void Decode (output, input, len)
+
+UINT4 *output;
+const unsigned char *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
+ (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
+}
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
new file mode 100644
index 0000000..50e2022
--- /dev/null
+++ b/sys/kern/md5c.c
@@ -0,0 +1,340 @@
+/*-
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * This code is the same as the code published by RSA Inc. It has been
+ * edited for clarity and style only.
+ */
+
+/*
+ * This file should be kept in sync with src/lib/libmd/md5c.c
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <machine/endian.h>
+#include <sys/endian.h>
+#include <sys/md5.h>
+
+static void MD5Transform(u_int32_t [4], const unsigned char [64]);
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+#define Encode memcpy
+#define Decode memcpy
+#else
+
+/*
+ * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Encode (unsigned char *output, u_int32_t *input, unsigned int len)
+{
+ unsigned int i;
+ uint32_t ip;
+
+ for (i = 0; i < len / 4; i++) {
+ ip = input[i];
+ *output++ = ip;
+ *output++ = ip >> 8;
+ *output++ = ip >> 16;
+ *output++ = ip >> 24;
+ }
+}
+
+/*
+ * Decodes input (unsigned char) into output (u_int32_t). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Decode (u_int32_t *output, const unsigned char *input, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i += 4) {
+ *output++ = input[i] | (input[i+1] << 8) | (input[i+2] << 16) |
+ (input[i+3] << 24);
+ }
+}
+#endif
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits. */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context. */
+
+void
+MD5Init (context)
+ MD5_CTX *context;
+{
+
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants. */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/*
+ * MD5 block update operation. Continues an MD5 message-digest
+ * operation, processing another message block, and updating the
+ * context.
+ */
+
+void
+MD5Update (context, in, inputLen)
+ MD5_CTX *context;
+ const void *in;
+ unsigned int inputLen;
+{
+ unsigned int i, index, partLen;
+ const unsigned char *input = in;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits */
+ if ((context->count[0] += ((u_int32_t)inputLen << 3))
+ < ((u_int32_t)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((u_int32_t)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ /* Transform as many times as possible. */
+ if (inputLen >= partLen) {
+ memcpy((void *)&context->buffer[index], (const void *)input,
+ partLen);
+ MD5Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy ((void *)&context->buffer[index], (const void *)&input[i],
+ inputLen-i);
+}
+
+/*
+ * MD5 padding. Adds padding followed by original length.
+ */
+
+static void
+MD5Pad (MD5_CTX *context)
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64. */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD5Update (context, bits, 8);
+}
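As a worked example of the padLen arithmetic: if 60 bytes have been hashed so far, index is 60 and padLen = 120 - 60 = 60, leaving the buffer at 120 bytes, which is 56 mod 64; the eight length bytes appended next complete a 64-byte block. If index were 20 instead, padLen would be 56 - 20 = 36, again landing on 56 mod 64 before the length is appended.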
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation, writing the
+ * message digest and zeroizing the context.
+ */
+
+void
+MD5Final (digest, context)
+ unsigned char digest[16];
+ MD5_CTX *context;
+{
+ /* Do padding. */
+ MD5Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information. */
+ memset ((void *)context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block. */
+
+static void
+MD5Transform (state, block)
+ u_int32_t state[4];
+ const unsigned char block[64];
+{
+ u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information. */
+ memset ((void *)x, 0, sizeof (x));
+}
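Putting the pieces together, the intended calling sequence is init, one or more updates, then final. A minimal sketch against the declarations in this file (the buffer and its length are placeholders):

    #include <sys/md5.h>

    static void
    digest_example(const void *buf, unsigned int len, unsigned char digest[16])
    {
        MD5_CTX ctx;

        MD5Init(&ctx);              /* load the magic initial state */
        MD5Update(&ctx, buf, len);  /* may be called repeatedly to stream data in */
        MD5Final(digest, &ctx);     /* pads, writes the 16 digest bytes, zeroizes ctx */
    }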
diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c
new file mode 100644
index 0000000..fb89efc
--- /dev/null
+++ b/sys/kern/p1003_1b.c
@@ -0,0 +1,315 @@
+/*-
+ * Copyright (c) 1996, 1997, 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* The system calls return ENOSYS if an entry is called that is not supported
+ * at run time.  I am also logging since some programs start to use this when
+ * they shouldn't. That will be removed if annoying.
+ */
+int
+syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ td->td_name, td->td_proc->p_pid, s);
+
+ /* a " return nosys(p, uap); " here causes a core dump.
+ */
+
+ return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via a module:
+ */
+
+static int
+sched_attach(void)
+{
+ return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int
+sched_attach(void)
+{
+ int ret = ksched_attach(&ksched);
+
+ if (ret == 0)
+ p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 200112L);
+
+ return ret;
+}
+
+int
+sys_sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+{
+ struct thread *targettd;
+ struct proc *targetp;
+ int e;
+ struct sched_param sched_param;
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setparam(ksched, targettd,
+ (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ return (ESRCH);
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getparam(ksched, targettd, &sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ if (e == 0)
+ e = copyout(&sched_param, uap->param, sizeof(sched_param));
+ return (e);
+}
+
+int
+sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ /* Don't allow non root user to set a scheduler policy. */
+ e = priv_check(td, PRIV_SCHED_SET);
+ if (e)
+ return (e);
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setscheduler(ksched, targettd,
+ uap->policy, (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+{
+ int e, policy;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getscheduler(ksched, targettd, &policy);
+ td->td_retval[0] = policy;
+ }
+ PROC_UNLOCK(targetp);
+
+ return (e);
+}
+
+int
+sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
+{
+
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+sys_sched_get_priority_max(struct thread *td,
+ struct sched_get_priority_max_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_max(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sys_sched_get_priority_min(struct thread *td,
+ struct sched_get_priority_min_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_min(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sys_sched_rr_get_interval(struct thread *td,
+ struct sched_rr_get_interval_args *uap)
+{
+ struct timespec timespec;
+ int error;
+
+ error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
+ if (error == 0)
+ error = copyout(&timespec, uap->interval, sizeof(timespec));
+ return (error);
+}
+
+int
+kern_sched_rr_get_interval(struct thread *td, pid_t pid,
+ struct timespec *ts)
+{
+ int e;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (pid == 0) {
+ targettd = td;
+ targetp = td->td_proc;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0)
+ e = ksched_rr_get_interval(ksched, targettd, ts);
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+#endif
+
+static void
+p31binit(void *notused)
+{
+ (void) sched_attach();
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c
new file mode 100644
index 0000000..e299787
--- /dev/null
+++ b/sys/kern/posix4_mib.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+static int facility_initialized[CTL_P1003_1B_MAXID - 1];
+
+static int p31b_sysctl_proc(SYSCTL_HANDLER_ARGS);
+
+/* OID_AUTO isn't working with sysconf(3). I guess I'd have to
+ * modify it to do a lookup by name from the index.
+ * For now I've left it a top-level sysctl.
+ */
+
+#if 1
+
+SYSCTL_DECL(_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+ SYSCTL_INT(_p1003_1b, num, name, CTLFLAG_RD | CTLFLAG_CAPRD, \
+ facility + num - 1, 0, "");
+#define P1B_SYSCTL_RW(num, name) \
+ SYSCTL_PROC(_p1003_1b, num, name, CTLTYPE_INT | CTLFLAG_RW, NULL, num, \
+ p31b_sysctl_proc, "I", "");
+
+#else
+
+SYSCTL_DECL(_kern_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+ SYSCTL_INT(_kern_p1003_1b, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_CAPRD, \
+ facility + num - 1, 0, "");
+#define P1B_SYSCTL_RW(num, name) \
+ SYSCTL_PROC(_p1003_1b, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW, NULL, \
+ num, p31b_sysctl_proc, "I", "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+SYSCTL_INT(_p1003_1b, CTL_P1003_1B_ASYNCHRONOUS_IO, \
+ asynchronous_io, CTLFLAG_RD, &async_io_version, 0, "");
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL_RW(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+#define P31B_VALID(num) ((num) >= 1 && (num) < CTL_P1003_1B_MAXID)
+
+static int
+p31b_sysctl_proc(SYSCTL_HANDLER_ARGS)
+{
+ int error, num, val;
+
+ num = arg2;
+ if (!P31B_VALID(num))
+ return (EINVAL);
+	val = facility_initialized[num - 1] ? facility[num - 1] : 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error == 0 && req->newptr != NULL && facility_initialized[num - 1])
+ facility[num - 1] = val;
+ return (error);
+}
+
+/* p31b_setcfg: Set the configuration
+ */
+void
+p31b_setcfg(int num, int value)
+{
+
+ if (P31B_VALID(num)) {
+ facility[num - 1] = value;
+ facility_initialized[num - 1] = 1;
+ }
+}
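Subsystems use this hook to advertise an optional POSIX facility once it is actually available, in the same way sched_attach() in p1003_1b.c above reports priority scheduling. A hypothetical caller (the constant and the 200112L revision value mirror the ones already used in this file):

    /* e.g. from a message-queue module's initialization routine */
    p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, 200112L);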
+
+void
+p31b_unsetcfg(int num)
+{
+
+ facility[num - 1] = 0;
+	facility_initialized[num - 1] = 0;
+}
+
+int
+p31b_getcfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility[num - 1]);
+ return (0);
+}
+
+int
+p31b_iscfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility_initialized[num - 1]);
+ return (0);
+}
+
+/*
+ * Turn on indications for standard (non-configurable) kernel features.
+ */
+static void
+p31b_set_standard(void *dummy)
+{
+
+ p31b_setcfg(CTL_P1003_1B_FSYNC, 200112L);
+ p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 200112L);
+ p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 200112L);
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_LISTIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+}
+
+SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard,
+ 0);
+
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
new file mode 100644
index 0000000..7c7d481
--- /dev/null
+++ b/sys/kern/sched_4bsd.c
@@ -0,0 +1,1784 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_sched.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/kthread.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+int dtrace_vtime_active;
+dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
+#endif
+
+/*
+ * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
+ * the range 100-256 Hz (approximately).
+ */
+#define ESTCPULIM(e) \
+ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
+ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
+#ifdef SMP
+#define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus)
+#else
+#define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
+#endif
+#define NICE_WEIGHT 1 /* Priorities per nice level. */
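To make the clamp concrete: on a uniprocessor kernel, and assuming the stock values of RQ_PPQ = 4 and a 40-step nice range (PRIO_MIN = -20, PRIO_MAX = 20), ESTCPULIM(e) works out to min(e, 8 * (1 * 40 - 4) + 8 - 1) = min(e, 295), which (roughly speaking) keeps the estimated-CPU value from pushing the priority later derived from it past the timeshare range.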
+
+#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
+
+/*
+ * The schedulable entity that runs a context.
+ * This is an extension to the thread structure and is tailored to
+ * the requirements of this scheduler
+ */
+struct td_sched {
+ fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */
+ int ts_cpticks; /* (j) Ticks of cpu time. */
+ int ts_slptime; /* (j) Seconds !RUNNING. */
+ int ts_slice; /* Remaining part of time slice. */
+ int ts_flags;
+ struct runq *ts_runq; /* runq the thread is currently on */
+#ifdef KTR
+ char ts_name[TS_NAME_LEN];
+#endif
+};
+
+/* flags kept in td_flags */
+#define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */
+#define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */
+#define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */
+
+/* flags kept in ts_flags */
+#define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */
+
+#define SKE_RUNQ_PCPU(ts) \
+ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
+
+#define THREAD_CAN_SCHED(td, cpu) \
+ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
+
+static struct td_sched td_sched0;
+struct mtx sched_lock;
+
+static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
+static int sched_tdcnt; /* Total runnable threads in the system. */
+static int sched_slice = 12; /* Thread run time before rescheduling. */
+
+static void setup_runqs(void);
+static void schedcpu(void);
+static void schedcpu_thread(void);
+static void sched_priority(struct thread *td, u_char prio);
+static void sched_setup(void *dummy);
+static void maybe_resched(struct thread *td);
+static void updatepri(struct thread *td);
+static void resetpriority(struct thread *td);
+static void resetpriority_thread(struct thread *td);
+#ifdef SMP
+static int sched_pickcpu(struct thread *td);
+static int forward_wakeup(int cpunum);
+static void kick_other_cpu(int pri, int cpuid);
+#endif
+
+static struct kproc_desc sched_kp = {
+ "schedcpu",
+ schedcpu_thread,
+ NULL
+};
+SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
+ &sched_kp);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
+ NULL);
+
+/*
+ * Global run queue.
+ */
+static struct runq runq;
+
+#ifdef SMP
+/*
+ * Per-CPU run queues
+ */
+static struct runq runq_pcpu[MAXCPU];
+long runq_length[MAXCPU];
+
+static cpuset_t idle_cpus_mask;
+#endif
+
+struct pcpuidlestat {
+ u_int idlecalls;
+ u_int oldidlecalls;
+};
+static DPCPU_DEFINE(struct pcpuidlestat, idlestat);
+
+static void
+setup_runqs(void)
+{
+#ifdef SMP
+ int i;
+
+ for (i = 0; i < MAXCPU; ++i)
+ runq_init(&runq_pcpu[i]);
+#endif
+
+ runq_init(&runq);
+}
+
+static int
+sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
+{
+ int error, new_val, period;
+
+ period = 1000000 / realstathz;
+ new_val = period * sched_slice;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val <= 0)
+ return (EINVAL);
+ sched_slice = imax(1, (new_val + period / 2) / period);
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+ return (0);
+}
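+
+/*
+ * Example, assuming the common stathz of 127 (so realstathz == 127 and
+ * sched_slice == 12 after sched_initticks()): reading kern.sched.quantum
+ * reports (1000000 / 127) * 12 ~= 94488 us, i.e. a ~94 ms quantum, and
+ * writing, say, 50000 rounds to the nearest whole stathz tick, giving
+ * sched_slice == 6 (~47 ms).
+ */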
+
+SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
+
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
+ "Scheduler name");
+SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, sysctl_kern_quantum, "I",
+ "Quantum for timeshare threads in microseconds");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+ "Quantum for timeshare threads in stathz ticks");
+#ifdef SMP
+/* Enable forwarding of wakeups to all other cpus */
+static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
+ "Kernel SMP");
+
+static int runq_fuzz = 1;
+SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
+
+static int forward_wakeup_enabled = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
+ &forward_wakeup_enabled, 0,
+ "Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_requested = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
+ &forward_wakeups_requested, 0,
+ "Requests for Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_delivered = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
+ &forward_wakeups_delivered, 0,
+ "Completed Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeup_use_mask = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
+ &forward_wakeup_use_mask, 0,
+ "Use the mask of idle cpus");
+
+static int forward_wakeup_use_loop = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
+ &forward_wakeup_use_loop, 0,
+ "Use a loop to find idle cpus");
+
+#endif
+#if 0
+static int sched_followon = 0;
+SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
+ &sched_followon, 0,
+ "allow threads to share a quantum");
+#endif
+
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *",
+ "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *",
+ "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *",
+ "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *",
+ "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+ "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+ "struct proc *");
+
+static __inline void
+sched_load_add(void)
+{
+
+ sched_tdcnt++;
+ KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+ SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+}
+
+static __inline void
+sched_load_rem(void)
+{
+
+ sched_tdcnt--;
+ KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+ SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+}
+/*
+ * Arrange to reschedule if necessary, taking the priorities and
+ * schedulers into account.
+ */
+static void
+maybe_resched(struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
+}
+
+/*
+ * This function is called when a thread is about to be put on a run queue
+ * because it has been made runnable or its priority has been adjusted. It
+ * determines whether the current thread should immediately be preempted in
+ * favor of the new thread. If so, it switches to the new thread and
+ * eventually returns true. If not, it returns false so that the caller may
+ * place the thread on an appropriate run queue.
+ */
+int
+maybe_preempt(struct thread *td)
+{
+#ifdef PREEMPTION
+ struct thread *ctd;
+ int cpri, pri;
+
+ /*
+ * The new thread should not preempt the current thread if any of the
+ * following conditions are true:
+ *
+ * - The kernel is in the throes of crashing (panicstr).
+ * - The current thread has a higher (numerically lower) or
+ * equivalent priority. Note that this prevents curthread from
+ * trying to preempt to itself.
+ * - It is too early in the boot for context switches (cold is set).
+ * - The current thread has an inhibitor set or is in the process of
+ * exiting. In this case, the current thread is about to switch
+	 *   out anyway, so there's no point in preempting. If we did,
+	 *   the current thread would not be properly resumed either, so
+	 *   just avoid that whole landmine.
+	 * - The new thread's priority is not a realtime priority, the
+	 *   current thread's priority is not an idle priority, and
+	 *   FULL_PREEMPTION is disabled.
+ *
+ * If all of these conditions are false, but the current thread is in
+ * a nested critical section, then we have to defer the preemption
+ * until we exit the critical section. Otherwise, switch immediately
+ * to the new thread.
+ */
+ ctd = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("maybe_preempt: trying to run inhibited thread"));
+ pri = td->td_priority;
+ cpri = ctd->td_priority;
+ if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
+ TD_IS_INHIBITED(ctd))
+ return (0);
+#ifndef FULL_PREEMPTION
+ if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
+ return (0);
+#endif
+
+ if (ctd->td_critnest > 1) {
+ CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
+ ctd->td_critnest);
+ ctd->td_owepreempt = 1;
+ return (0);
+ }
+ /*
+ * Thread is runnable but not yet put on system run queue.
+ */
+ MPASS(ctd->td_lock == td->td_lock);
+ MPASS(TD_ON_RUNQ(td));
+ TD_SET_RUNNING(td);
+ CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
+ td->td_proc->p_pid, td->td_name);
+ mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
+ /*
+ * td's lock pointer may have changed. We have to return with it
+ * locked.
+ */
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
+ return (1);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * Constants for digital decay and forget:
+ * 90% of (td_estcpu) usage in 5 * loadav time
+ * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
+ * Note that, as ps(1) mentions, this can let percentages
+ * total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
+ *
+ * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * for (i = 0; i < (5 * loadavg); i++)
+ * td_estcpu *= decay;
+ * will compute
+ * td_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * b = 2 * loadavg
+ * then
+ * decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ * For x close to zero, exp(x) =~ 1 + x, since
+ * exp(x) = 0! + x**1/1! + x**2/2! + ... .
+ * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ * For x close to zero, ln(1+x) =~ x, since
+ * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
+ * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ * ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ * Solve (factor)**(power) =~ .1 given power (5*loadav):
+ * solving for factor,
+ * ln(factor) =~ (-2.30/5*loadav), or
+ * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
+ *
+ * Proof of (2):
+ * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ * solving for power,
+ * power*ln(b/(b+1)) =~ -2.30, or
+ * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ * loadav: 1 2 3 4
+ * power: 5.68 10.32 14.94 19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define loadfactor(loadav) (2 * (loadav))
+#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
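+
+/*
+ * Worked example, assuming a load average of 1.0: loadfactor() is then 2.0
+ * in FSCALE fixed point, so decay_cpu() multiplies td_estcpu by
+ * 2 / (2 + 1) == 2/3 on each pass. schedcpu() runs once a second, so after
+ * about 5.7 passes only ~10% of the original estimate remains, matching the
+ * loadav == 1 entry in the table above.
+ */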
+
+/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT 11
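+
+/*
+ * For intuition: ccpu / FSCALE == exp(-1/20) ~= 0.95123 and schedcpu()
+ * multiplies ts_pctcpu by it once a second, so after 60 seconds only
+ * exp(-3) ~= 5% of an old sample remains -- the "decay 95% in 60 seconds"
+ * behaviour noted above. The >> FSHIFT in that multiplication rescales the
+ * fixed-point product back into FSCALE units.
+ */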
+
+/*
+ * Recompute process priorities every hz ticks (i.e., once a second).
+ * MP-safe, called without the Giant mutex.
+ */
+/* ARGSUSED */
+static void
+schedcpu(void)
+{
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ struct thread *td;
+ struct proc *p;
+ struct td_sched *ts;
+ int awake;
+
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ awake = 0;
+ thread_lock(td);
+ ts = td->td_sched;
+ /*
+ * Increment sleep time (if sleeping). We
+ * ignore overflow, as above.
+ */
+ /*
+ * The td_sched slptimes are not touched in wakeup
+ * because the thread may not HAVE everything in
+ * memory? XXX I think this is out of date.
+ */
+ if (TD_ON_RUNQ(td)) {
+ awake = 1;
+ td->td_flags &= ~TDF_DIDRUN;
+ } else if (TD_IS_RUNNING(td)) {
+ awake = 1;
+ /* Do not clear TDF_DIDRUN */
+ } else if (td->td_flags & TDF_DIDRUN) {
+ awake = 1;
+ td->td_flags &= ~TDF_DIDRUN;
+ }
+
+ /*
+ * ts_pctcpu is only for ps and ttyinfo().
+ */
+ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
+ /*
+ * If the td_sched has been idle the entire second,
+ * stop recalculating its priority until
+ * it wakes up.
+ */
+ if (ts->ts_cpticks != 0) {
+#if (FSHIFT >= CCPU_SHIFT)
+ ts->ts_pctcpu += (realstathz == 100)
+ ? ((fixpt_t) ts->ts_cpticks) <<
+ (FSHIFT - CCPU_SHIFT) :
+ 100 * (((fixpt_t) ts->ts_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ ts->ts_pctcpu += ((FSCALE - ccpu) *
+ (ts->ts_cpticks *
+ FSCALE / realstathz)) >> FSHIFT;
+#endif
+ ts->ts_cpticks = 0;
+ }
+ /*
+ * If there are ANY running threads in this process,
+ * then don't count it as sleeping.
+ * XXX: this is broken.
+ */
+ if (awake) {
+ if (ts->ts_slptime > 1) {
+ /*
+ * In an ideal world, this should not
+ * happen, because whoever woke us
+ * up from the long sleep should have
+ * unwound the slptime and reset our
+ * priority before we run at the stale
+ * priority. Should KASSERT at some
+ * point when all the cases are fixed.
+ */
+ updatepri(td);
+ }
+ ts->ts_slptime = 0;
+ } else
+ ts->ts_slptime++;
+ if (ts->ts_slptime > 1) {
+ thread_unlock(td);
+ continue;
+ }
+ td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+}
+
+/*
+ * Main loop for a kthread that executes schedcpu once a second.
+ */
+static void
+schedcpu_thread(void)
+{
+
+ for (;;) {
+ schedcpu();
+ pause("-", hz);
+ }
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay td_estcpu to zero.
+ */
+static void
+updatepri(struct thread *td)
+{
+ struct td_sched *ts;
+ fixpt_t loadfac;
+ unsigned int newcpu;
+
+ ts = td->td_sched;
+ loadfac = loadfactor(averunnable.ldavg[0]);
+ if (ts->ts_slptime > 5 * loadfac)
+ td->td_estcpu = 0;
+ else {
+ newcpu = td->td_estcpu;
+ ts->ts_slptime--; /* was incremented in schedcpu() */
+ while (newcpu && --ts->ts_slptime)
+ newcpu = decay_cpu(loadfac, newcpu);
+ td->td_estcpu = newcpu;
+ }
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+static void
+resetpriority(struct thread *td)
+{
+ register unsigned int newpriority;
+
+ if (td->td_pri_class == PRI_TIMESHARE) {
+ newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
+ NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
+ newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
+ PRI_MAX_TIMESHARE);
+ sched_user_prio(td, newpriority);
+ }
+}
+
+/*
+ * Update the thread's priority when the associated process's user
+ * priority changes.
+ */
+static void
+resetpriority_thread(struct thread *td)
+{
+
+ /* Only change threads with a time sharing user priority. */
+ if (td->td_priority < PRI_MIN_TIMESHARE ||
+ td->td_priority > PRI_MAX_TIMESHARE)
+ return;
+
+ /* XXX the whole needresched thing is broken, but not silly. */
+ maybe_resched(td);
+
+ sched_prio(td, td->td_user_pri);
+}
+
+/* ARGSUSED */
+static void
+sched_setup(void *dummy)
+{
+
+ setup_runqs();
+
+ /* Account for thread0. */
+ sched_load_add();
+}
+
+/*
+ * This routine determines time constants after stathz and hz are setup.
+ */
+static void
+sched_initticks(void *dummy)
+{
+
+ realstathz = stathz ? stathz : hz;
+ sched_slice = realstathz / 10; /* ~100ms */
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+}
+
+/* External interfaces start here */
+
+/*
+ * Very early in the boot some setup of scheduler-specific
+ * parts of proc0 and of some scheduler resources needs to be done.
+ * Called from:
+ * proc0_init()
+ */
+void
+schedinit(void)
+{
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ proc0.p_sched = NULL; /* XXX */
+ thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
+ td_sched0.ts_slice = sched_slice;
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+}
+
+int
+sched_runnable(void)
+{
+#ifdef SMP
+ return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
+#else
+ return runq_check(&runq);
+#endif
+}
+
+int
+sched_rr_interval(void)
+{
+
+ /* Convert sched_slice from stathz to hz. */
+ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
+}
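+
+/*
+ * E.g. with hz == 1000, stathz == 127 and the default sched_slice of 12,
+ * sched_rr_interval() reports (12 * 1000 + 63) / 127 == 94 ticks, i.e.
+ * roughly the ~100 ms slice enforced by sched_clock() below.
+ */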
+
+/*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (td_estcpu) is increased here. resetpriority() will
+ * compute a different priority each time td_estcpu increases by
+ * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached). The cpu usage
+ * estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+void
+sched_clock(struct thread *td)
+{
+ struct pcpuidlestat *stat;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+
+ ts->ts_cpticks++;
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
+ if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+ resetpriority(td);
+ resetpriority_thread(td);
+ }
+
+ /*
+ * Force a context switch if the current thread has used up a full
+ * time slice (default is 100ms).
+ */
+ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
+ ts->ts_slice = sched_slice;
+ td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
+ }
+
+ stat = DPCPU_PTR(idlestat);
+ stat->oldidlecalls = stat->idlecalls;
+ stat->idlecalls = 0;
+}
+
+/*
+ * Charge child's scheduling CPU usage to parent.
+ */
+void
+sched_exit(struct proc *p, struct thread *td)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
+ "prio:%d", td->td_priority);
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
+}
+
+void
+sched_exit_thread(struct thread *td, struct thread *child)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
+ "prio:%d", child->td_priority);
+ thread_lock(td);
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
+ thread_unlock(td);
+ thread_lock(child);
+ if ((child->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+ thread_unlock(child);
+}
+
+void
+sched_fork(struct thread *td, struct thread *childtd)
+{
+ sched_fork_thread(td, childtd);
+}
+
+void
+sched_fork_thread(struct thread *td, struct thread *childtd)
+{
+ struct td_sched *ts;
+
+ childtd->td_estcpu = td->td_estcpu;
+ childtd->td_lock = &sched_lock;
+ childtd->td_cpuset = cpuset_ref(td->td_cpuset);
+ childtd->td_priority = childtd->td_base_pri;
+ ts = childtd->td_sched;
+ bzero(ts, sizeof(*ts));
+ ts->ts_flags |= (td->td_sched->ts_flags & TSF_AFFINITY);
+ ts->ts_slice = 1;
+}
+
+void
+sched_nice(struct proc *p, int nice)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_nice = nice;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
+ }
+}
+
+void
+sched_class(struct thread *td, int class)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_pri_class = class;
+}
+
+/*
+ * Adjust the priority of a thread.
+ */
+static void
+sched_priority(struct thread *td, u_char prio)
+{
+
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
+ "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+ if (td != curthread && prio > td->td_priority) {
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
+ "lend prio", "prio:%d", td->td_priority, "new prio:%d",
+ prio, KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio,
+ curthread);
+ }
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority == prio)
+ return;
+ td->td_priority = prio;
+ if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ }
+}
+
+/*
+ * Update a thread's priority when it is lent another thread's
+ * priority.
+ */
+void
+sched_lend_prio(struct thread *td, u_char prio)
+{
+
+ td->td_flags |= TDF_BORROWING;
+ sched_priority(td, prio);
+}
+
+/*
+ * Restore a thread's priority when priority propagation is
+ * over. The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests. If the thread's regular priority is less
+ * important than prio, the thread will keep a priority
+ * boost of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+ td->td_base_pri <= PRI_MAX_TIMESHARE)
+ base_pri = td->td_user_pri;
+ else
+ base_pri = td->td_base_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_BORROWING;
+ sched_prio(td, base_pri);
+ } else
+ sched_lend_prio(td, prio);
+}
+
+void
+sched_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ /* First, update the base priority. */
+ td->td_base_pri = prio;
+
+ /*
+ * If the thread is borrowing another thread's priority, don't ever
+ * lower the priority.
+ */
+ if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+ return;
+
+ /* Change the real priority. */
+ oldprio = td->td_priority;
+ sched_priority(td, prio);
+
+ /*
+ * If the thread is on a turnstile, then let the turnstile update
+ * its state.
+ */
+ if (TD_ON_LOCK(td) && oldprio != prio)
+ turnstile_adjust(td, oldprio);
+}
+
+void
+sched_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_base_user_pri = prio;
+ if (td->td_lend_user_pri <= prio)
+ return;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_lend_user_pri = prio;
+ td->td_user_pri = min(prio, td->td_base_user_pri);
+ if (td->td_priority > td->td_user_pri)
+ sched_prio(td, td->td_user_pri);
+ else if (td->td_priority != td->td_user_pri)
+ td->td_flags |= TDF_NEEDRESCHED;
+}
+
+void
+sched_sleep(struct thread *td, int pri)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_slptick = ticks;
+ td->td_sched->ts_slptime = 0;
+ if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, pri);
+ if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
+ td->td_flags |= TDF_CANSWAP;
+}
+
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct mtx *tmtx;
+ struct td_sched *ts;
+ struct proc *p;
+ int preempted;
+
+ tmtx = NULL;
+ ts = td->td_sched;
+ p = td->td_proc;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ * Block the td_lock in order to avoid breaking the critical path.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ tmtx = thread_lock_block(td);
+ }
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+
+ td->td_lastcpu = td->td_oncpu;
+ preempted = !(td->td_flags & TDF_SLICEEND);
+ td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+ td->td_owepreempt = 0;
+ td->td_oncpu = NOCPU;
+
+ /*
+ * At the last moment, if this thread is still marked RUNNING,
+ * then put it back on the run queue as it has not been suspended
+	 * or stopped or anything else similar. We never put the idle
+ * threads on the run queue, however.
+ */
+ if (td->td_flags & TDF_IDLETD) {
+ TD_SET_CAN_RUN(td);
+#ifdef SMP
+ CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
+#endif
+ } else {
+ if (TD_IS_RUNNING(td)) {
+ /* Put us back on the run queue. */
+ sched_add(td, preempted ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING);
+ }
+ }
+ if (newtd) {
+ /*
+ * The thread we are about to run needs to be counted
+ * as if it had been added to the run queue and selected.
+ * It came from:
+ * * A preemption
+ * * An upcall
+ * * A followon
+ */
+ KASSERT((newtd->td_inhibitors == 0),
+ ("trying to run inhibited thread"));
+ newtd->td_flags |= TDF_DIDRUN;
+ TD_SET_RUNNING(newtd);
+ if ((newtd->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ } else {
+ newtd = choosethread();
+ MPASS(newtd->td_lock == &sched_lock);
+ }
+
+ if (td != newtd) {
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+
+ SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+
+ /* I feel sleepy */
+ lock_profile_release_lock(&sched_lock.lock_object);
+#ifdef KDTRACE_HOOKS
+ /*
+ * If DTrace has set the active vtime enum to anything
+ * other than INACTIVE (0), then it should have set the
+ * function to call.
+ */
+ if (dtrace_vtime_active)
+ (*dtrace_vtime_switch_func)(newtd);
+#endif
+
+ cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
+ lock_profile_obtain_lock_success(&sched_lock.lock_object,
+ 0, 0, __FILE__, __LINE__);
+ /*
+ * Where am I? What year is it?
+ * We are in the same thread that went to sleep above,
+ * but any amount of time may have passed. All our context
+ * will still be available as will local variables.
+ * PCPU values however may have changed as we may have
+ * changed CPU so don't trust cached values of them.
+ * New threads will go to fork_exit() instead of here
+ * so if you change things here you may need to change
+ * things there too.
+ *
+ * If the thread above was exiting it will never wake
+ * up again here, so either it has saved everything it
+ * needed to, or the thread_wait() or wait() will
+ * need to reap it.
+ */
+
+ SDT_PROBE0(sched, , , on_cpu);
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+ } else
+ SDT_PROBE0(sched, , , remain_cpu);
+
+#ifdef SMP
+ if (td->td_flags & TDF_IDLETD)
+ CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
+#endif
+ sched_lock.mtx_lock = (uintptr_t)td;
+ td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
+}
+
+void
+sched_wakeup(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ td->td_flags &= ~TDF_CANSWAP;
+ if (ts->ts_slptime > 1) {
+ updatepri(td);
+ resetpriority(td);
+ }
+ td->td_slptick = 0;
+ ts->ts_slptime = 0;
+ ts->ts_slice = sched_slice;
+ sched_add(td, SRQ_BORING);
+}
+
+#ifdef SMP
+static int
+forward_wakeup(int cpunum)
+{
+ struct pcpu *pc;
+ cpuset_t dontuse, map, map2;
+ u_int id, me;
+ int iscpuset;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ CTR0(KTR_RUNQ, "forward_wakeup()");
+
+ if ((!forward_wakeup_enabled) ||
+ (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
+ return (0);
+ if (!smp_started || cold || panicstr)
+ return (0);
+
+ forward_wakeups_requested++;
+
+ /*
+ * Check the idle mask we received against what we calculated
+ * before in the old version.
+ */
+ me = PCPU_GET(cpuid);
+
+ /* Don't bother if we should be doing it ourself. */
+ if (CPU_ISSET(me, &idle_cpus_mask) &&
+ (cpunum == NOCPU || me == cpunum))
+ return (0);
+
+ CPU_SETOF(me, &dontuse);
+ CPU_OR(&dontuse, &stopped_cpus);
+ CPU_OR(&dontuse, &hlt_cpus_mask);
+ CPU_ZERO(&map2);
+ if (forward_wakeup_use_loop) {
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &dontuse) &&
+ pc->pc_curthread == pc->pc_idlethread) {
+ CPU_SET(id, &map2);
+ }
+ }
+ }
+
+ if (forward_wakeup_use_mask) {
+ map = idle_cpus_mask;
+ CPU_NAND(&map, &dontuse);
+
+ /* If they are both on, compare and use loop if different. */
+ if (forward_wakeup_use_loop) {
+ if (CPU_CMP(&map, &map2)) {
+ printf("map != map2, loop method preferred\n");
+ map = map2;
+ }
+ }
+ } else {
+ map = map2;
+ }
+
+ /* If we only allow a specific CPU, then mask off all the others. */
+ if (cpunum != NOCPU) {
+ KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
+ iscpuset = CPU_ISSET(cpunum, &map);
+ if (iscpuset == 0)
+ CPU_ZERO(&map);
+ else
+ CPU_SETOF(cpunum, &map);
+ }
+ if (!CPU_EMPTY(&map)) {
+ forward_wakeups_delivered++;
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &map))
+ continue;
+ if (cpu_idle_wakeup(pc->pc_cpuid))
+ CPU_CLR(id, &map);
+ }
+ if (!CPU_EMPTY(&map))
+ ipi_selected(map, IPI_AST);
+ return (1);
+ }
+ if (cpunum == NOCPU)
+ printf("forward_wakeup: Idle processor not found\n");
+ return (0);
+}
+
+static void
+kick_other_cpu(int pri, int cpuid)
+{
+ struct pcpu *pcpu;
+ int cpri;
+
+ pcpu = pcpu_find(cpuid);
+ if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
+ forward_wakeups_delivered++;
+ if (!cpu_idle_wakeup(cpuid))
+ ipi_cpu(cpuid, IPI_AST);
+ return;
+ }
+
+ cpri = pcpu->pc_curthread->td_priority;
+ if (pri >= cpri)
+ return;
+
+#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
+#if !defined(FULL_PREEMPTION)
+ if (pri <= PRI_MAX_ITHD)
+#endif /* ! FULL_PREEMPTION */
+ {
+ ipi_cpu(cpuid, IPI_PREEMPT);
+ return;
+ }
+#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
+
+ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
+ ipi_cpu(cpuid, IPI_AST);
+ return;
+}
+#endif /* SMP */
+
+#ifdef SMP
+static int
+sched_pickcpu(struct thread *td)
+{
+ int best, cpu;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ if (THREAD_CAN_SCHED(td, td->td_lastcpu))
+ best = td->td_lastcpu;
+ else
+ best = NOCPU;
+ CPU_FOREACH(cpu) {
+ if (!THREAD_CAN_SCHED(td, cpu))
+ continue;
+
+ if (best == NOCPU)
+ best = cpu;
+ else if (runq_length[cpu] < runq_length[best])
+ best = cpu;
+ }
+ KASSERT(best != NOCPU, ("no valid CPUs"));
+
+ return (best);
+}
+#endif
+
+void
+sched_add(struct thread *td, int flags)
+#ifdef SMP
+{
+ cpuset_t tidlemsk;
+ struct td_sched *ts;
+ u_int cpu, cpuid;
+ int forwarded = 0;
+ int single_cpu = 0;
+
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
+
+ /*
+ * If SMP is started and the thread is pinned or otherwise limited to
+ * a specific set of CPUs, queue the thread to a per-CPU run queue.
+ * Otherwise, queue the thread to the global run queue.
+ *
+ * If SMP has not yet been started we must use the global run queue
+ * as per-CPU state may not be initialized yet and we may crash if we
+ * try to access the per-CPU run queues.
+ */
+ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
+ ts->ts_flags & TSF_AFFINITY)) {
+ if (td->td_pinned != 0)
+ cpu = td->td_lastcpu;
+ else if (td->td_flags & TDF_BOUND) {
+ /* Find CPU from bound runq. */
+ KASSERT(SKE_RUNQ_PCPU(ts),
+ ("sched_add: bound td_sched not on cpu runq"));
+ cpu = ts->ts_runq - &runq_pcpu[0];
+ } else
+ /* Find a valid CPU for our cpuset */
+ cpu = sched_pickcpu(td);
+ ts->ts_runq = &runq_pcpu[cpu];
+ single_cpu = 1;
+ CTR3(KTR_RUNQ,
+ "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
+ cpu);
+ } else {
+ CTR2(KTR_RUNQ,
+ "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
+ td);
+ cpu = NOCPU;
+ ts->ts_runq = &runq;
+ }
+
+ cpuid = PCPU_GET(cpuid);
+ if (single_cpu && cpu != cpuid) {
+ kick_other_cpu(td->td_priority, cpu);
+ } else {
+ if (!single_cpu) {
+ tidlemsk = idle_cpus_mask;
+ CPU_NAND(&tidlemsk, &hlt_cpus_mask);
+ CPU_CLR(cpuid, &tidlemsk);
+
+ if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
+ ((flags & SRQ_INTR) == 0) &&
+ !CPU_EMPTY(&tidlemsk))
+ forwarded = forward_wakeup(cpu);
+ }
+
+ if (!forwarded) {
+ if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
+ return;
+ else
+ maybe_resched(td);
+ }
+ }
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ runq_add(ts->ts_runq, td, flags);
+ if (cpu != NOCPU)
+ runq_length[cpu]++;
+}
+#else /* SMP */
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
+ CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
+ ts->ts_runq = &runq;
+
+ /*
+ * If we are yielding (on the way out anyhow) or the thread
+	 * being saved is US, then don't try to be smart about preemption
+	 * or kicking off another CPU as it won't help and may hinder.
+	 * In the YIELDING case, we are about to run whoever is being
+	 * put in the queue anyhow, and in the OURSELF case, we are
+	 * putting ourselves on the run queue, which also only happens
+ * when we are about to yield.
+ */
+ if ((flags & SRQ_YIELDING) == 0) {
+ if (maybe_preempt(td))
+ return;
+ }
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ runq_add(ts->ts_runq, td, flags);
+ maybe_resched(td);
+}
+#endif /* SMP */
+
+void
+sched_rem(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_rem: thread swapped out"));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
+ mtx_assert(&sched_lock, MA_OWNED);
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+#ifdef SMP
+ if (ts->ts_runq != &runq)
+ runq_length[ts->ts_runq - runq_pcpu]--;
+#endif
+ runq_remove(ts->ts_runq, td);
+ TD_SET_CAN_RUN(td);
+}
+
+/*
+ * Select threads to run. Note that running threads still consume a
+ * slot.
+ */
+struct thread *
+sched_choose(void)
+{
+ struct thread *td;
+ struct runq *rq;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+#ifdef SMP
+ struct thread *tdcpu;
+
+ rq = &runq;
+ td = runq_choose_fuzz(&runq, runq_fuzz);
+ tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
+
+ if (td == NULL ||
+ (tdcpu != NULL &&
+ tdcpu->td_priority < td->td_priority)) {
+ CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
+ PCPU_GET(cpuid));
+ td = tdcpu;
+ rq = &runq_pcpu[PCPU_GET(cpuid)];
+ } else {
+ CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
+ }
+
+#else
+ rq = &runq;
+ td = runq_choose(&runq);
+#endif
+
+ if (td) {
+#ifdef SMP
+ if (td == tdcpu)
+ runq_length[PCPU_GET(cpuid)]--;
+#endif
+ runq_remove(rq, td);
+ td->td_flags |= TDF_DIDRUN;
+
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_choose: thread swapped out"));
+ return (td);
+ }
+ return (PCPU_GET(idlethread));
+}
+
+void
+sched_preempt(struct thread *td)
+{
+
+ SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+ thread_lock(td);
+ if (td->td_critnest > 1)
+ td->td_owepreempt = 1;
+ else
+ mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
+ thread_unlock(td);
+}
+
+void
+sched_userret(struct thread *td)
+{
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ thread_unlock(td);
+ }
+}
+
+void
+sched_bind(struct thread *td, int cpu)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+ KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
+
+ ts = td->td_sched;
+
+ td->td_flags |= TDF_BOUND;
+#ifdef SMP
+ ts->ts_runq = &runq_pcpu[cpu];
+ if (PCPU_GET(cpuid) == cpu)
+ return;
+
+ mi_switch(SW_VOL, NULL);
+#endif
+}
+
+void
+sched_unbind(struct thread* td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
+ td->td_flags &= ~TDF_BOUND;
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_flags & TDF_BOUND);
+}
+
+void
+sched_relinquish(struct thread *td)
+{
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+}
+
+int
+sched_load(void)
+{
+ return (sched_tdcnt);
+}
+
+int
+sched_sizeof_proc(void)
+{
+ return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+ return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ return (ts->ts_pctcpu);
+}
+
+#ifdef RACCT
+/*
+ * Calculates the contribution to the thread cpu usage for the latest
+ * (unfinished) second.
+ */
+fixpt_t
+sched_pctcpu_delta(struct thread *td)
+{
+ struct td_sched *ts;
+ fixpt_t delta;
+ int realstathz;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ delta = 0;
+ realstathz = stathz ? stathz : hz;
+ if (ts->ts_cpticks != 0) {
+#if (FSHIFT >= CCPU_SHIFT)
+ delta = (realstathz == 100)
+ ? ((fixpt_t) ts->ts_cpticks) <<
+ (FSHIFT - CCPU_SHIFT) :
+ 100 * (((fixpt_t) ts->ts_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ delta = ((FSCALE - ccpu) *
+ (ts->ts_cpticks *
+ FSCALE / realstathz)) >> FSHIFT;
+#endif
+ }
+
+ return (delta);
+}
+#endif
+
+void
+sched_tick(int cnt)
+{
+}
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct pcpuidlestat *stat;
+
+ THREAD_NO_SLEEPING();
+ stat = DPCPU_PTR(idlestat);
+ for (;;) {
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ while (sched_runnable() == 0) {
+ cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
+ stat->idlecalls++;
+ }
+
+ mtx_lock_spin(&sched_lock);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+/*
+ * A CPU is entering the scheduler for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ } else {
+ lock_profile_release_lock(&sched_lock.lock_object);
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *td)
+{
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ td->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)td;
+ lock_profile_obtain_lock_success(&sched_lock.lock_object,
+ 0, 0, __FILE__, __LINE__);
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+}
+
+char *
+sched_tdname(struct thread *td)
+{
+#ifdef KTR
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ if (ts->ts_name[0] == '\0')
+ snprintf(ts->ts_name, sizeof(ts->ts_name),
+ "%s tid %d", td->td_name, td->td_tid);
+ return (ts->ts_name);
+#else
+ return (td->td_name);
+#endif
+}
+
+#ifdef KTR
+void
+sched_clear_tdname(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ ts->ts_name[0] = '\0';
+}
+#endif
+
+void
+sched_affinity(struct thread *td)
+{
+#ifdef SMP
+ struct td_sched *ts;
+ int cpu;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * Set the TSF_AFFINITY flag if there is at least one CPU this
+ * thread can't run on.
+ */
+ ts = td->td_sched;
+ ts->ts_flags &= ~TSF_AFFINITY;
+ CPU_FOREACH(cpu) {
+ if (!THREAD_CAN_SCHED(td, cpu)) {
+ ts->ts_flags |= TSF_AFFINITY;
+ break;
+ }
+ }
+
+ /*
+ * If this thread can run on all CPUs, nothing else to do.
+ */
+ if (!(ts->ts_flags & TSF_AFFINITY))
+ return;
+
+ /* Pinned threads and bound threads should be left alone. */
+ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
+ return;
+
+ switch (td->td_state) {
+ case TDS_RUNQ:
+ /*
+ * If we are on a per-CPU runqueue that is in the set,
+ * then nothing needs to be done.
+ */
+ if (ts->ts_runq != &runq &&
+ THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
+ return;
+
+ /* Put this thread on a valid per-CPU runqueue. */
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ break;
+ case TDS_RUNNING:
+ /*
+ * See if our current CPU is in the set. If not, force a
+ * context switch.
+ */
+ if (THREAD_CAN_SCHED(td, td->td_oncpu))
+ return;
+
+ td->td_flags |= TDF_NEEDRESCHED;
+ if (td != curthread)
+ ipi_cpu(cpu, IPI_AST);
+ break;
+ default:
+ break;
+ }
+#endif
+}
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
new file mode 100644
index 0000000..cba9d80
--- /dev/null
+++ b/sys/kern/sched_ule.c
@@ -0,0 +1,2911 @@
+/*-
+ * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements the ULE scheduler. ULE supports independent CPU
+ * run queues and fine grain locking. It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ * ULE is the last three letters in schedule. It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <sys/vmmeter.h>
+#include <sys/cpuset.h>
+#include <sys/sbuf.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+int dtrace_vtime_active;
+dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
+#endif
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#if defined(__powerpc__) && defined(BOOKE_E500)
+#error "This architecture is not currently compatible with ULE"
+#endif
+
+#define KTR_ULE 0
+
+#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
+#define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
+#define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
+
+/*
+ * Thread scheduler specific section. All fields are protected
+ * by the thread lock.
+ */
+struct td_sched {
+ struct runq *ts_runq; /* Run-queue we're queued on. */
+ short ts_flags; /* TSF_* flags. */
+ u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_rltick; /* Real last tick, for affinity. */
+ int ts_slice; /* Ticks of slice remaining. */
+ u_int ts_slptime; /* Number of ticks we vol. slept */
+ u_int ts_runtime; /* Number of ticks we were running */
+ int ts_ltick; /* Last tick that we were running on */
+ int ts_ftick; /* First tick that we were running on */
+ int ts_ticks; /* Tick count */
+#ifdef KTR
+ char ts_name[TS_NAME_LEN];
+#endif
+};
+/* flags kept in ts_flags */
+#define TSF_BOUND 0x0001 /* Thread can not migrate. */
+#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
+
+static struct td_sched td_sched0;
+
+#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
+#define THREAD_CAN_SCHED(td, cpu) \
+ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
+
+/*
+ * Priority ranges used for interactive and non-interactive timeshare
+ * threads. The timeshare priorities are split up into four ranges.
+ * The first range handles interactive threads. The last three ranges
+ * (NHALF, x, and NHALF) handle non-interactive threads with the outer
+ * ranges supporting nice values.
+ */
+#define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
+#define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
+#define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
+
+#define PRI_MIN_INTERACT PRI_MIN_TIMESHARE
+#define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
+#define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
+#define PRI_MAX_BATCH PRI_MAX_TIMESHARE
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
+ * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
+ */
+#define SCHED_TICK_SECS 10
+#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
+#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
+#define SCHED_TICK_SHIFT 10
+#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
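+
+/*
+ * E.g. with hz == 1000 the usage window spans SCHED_TICK_TARG == 10000
+ * hz ticks (~10 seconds), ts_ticks is kept scaled by 2^SCHED_TICK_SHIFT
+ * (1024) to preserve precision, and SCHED_TICK_MAX allows roughly one
+ * extra second of accumulation before the counters are scaled back.
+ */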
+
+/*
+ * These macros determine priorities for non-interactive threads. They are
+ * assigned a priority based on their recent cpu utilization as expressed
+ * by the ratio of ticks to the tick total. NHALF priorities at the start
+ * and end of the MIN to MAX timeshare range are only reachable with negative
+ * or positive nice respectively.
+ *
+ * PRI_RANGE: Priority range for utilization dependent priorities.
+ * PRI_NRESV: Number of nice values.
+ * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total.
+ * PRI_NICE: Determines the part of the priority inherited from nice.
+ */
+#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
+#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
+#define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF)
+#define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF)
+#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
+#define SCHED_PRI_TICKS(ts) \
+ (SCHED_TICK_HZ((ts)) / \
+ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
+#define SCHED_PRI_NICE(nice) (nice)
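+
+/*
+ * Roughly speaking, SCHED_PRI_TICKS() maps the ratio of a thread's recent
+ * cpu ticks to the length of its sampling window onto [0, SCHED_PRI_RANGE),
+ * so a fully cpu-bound batch thread drifts toward SCHED_PRI_MAX while a
+ * mostly idle one stays near SCHED_PRI_MIN. SCHED_PRI_NICE() then adds the
+ * nice value on top of that.
+ */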
+
+/*
+ * These determine the interactivity of a process. Interactivity differs from
+ * cpu utilization in that it expresses the voluntary time slept vs time ran
+ * while cpu utilization includes all time not running. This more accurately
+ * models the intent of the thread.
+ *
+ * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
+ * before throttling back.
+ * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
+ * INTERACT_MAX: Maximum interactivity value. Smaller is better.
+ * INTERACT_THRESH: Threshold for placement on the current runq.
+ */
+#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
+#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
+#define SCHED_INTERACT_MAX (100)
+#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
+#define SCHED_INTERACT_THRESH (30)
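+
+/*
+ * Roughly: a thread whose voluntary sleep time dominates its run time ends
+ * up with a score well below SCHED_INTERACT_THRESH and is treated as
+ * interactive (and queued in the realtime range), while a cpu hog scores
+ * toward SCHED_INTERACT_MAX and is handled as batch work.
+ */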
+
+/*
+ * These parameters determine the slice behavior for batch work.
+ */
+#define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */
+#define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */
+
+/* Flags kept in td_flags. */
+#define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */
+
+/*
+ * tickincr: Converts a stathz tick into a hz domain scaled by
+ * the shift factor. Without the shift the error rate
+ * due to rounding would be unacceptably high.
+ * realstathz:		stathz is sometimes 0, so we run off of hz then.
+ * sched_slice: Runtime of each thread before rescheduling.
+ * preempt_thresh: Priority threshold for preemption and remote IPIs.
+ */
+static int sched_interact = SCHED_INTERACT_THRESH;
+static int tickincr = 8 << SCHED_TICK_SHIFT;
+static int realstathz = 127; /* reset during boot. */
+static int sched_slice = 10; /* reset during boot. */
+static int sched_slice_min = 1; /* reset during boot. */
+#ifdef PREEMPTION
+#ifdef FULL_PREEMPTION
+static int preempt_thresh = PRI_MAX_IDLE;
+#else
+static int preempt_thresh = PRI_MIN_KERN;
+#endif
+#else
+static int preempt_thresh = 0;
+#endif
+static int static_boost = PRI_MIN_BATCH;
+static int sched_idlespins = 10000;
+static int sched_idlespinthresh = -1;
+
+/*
+ * tdq - per processor runqs and statistics. All fields are protected by the
+ * tdq_lock. The load and lowpri may be accessed without the lock to avoid
+ * excess locking in sched_pickcpu().
+ */
+struct tdq {
+ /*
+ * Ordered to improve efficiency of cpu_search() and switch().
+ * tdq_lock is padded to avoid false sharing with tdq_load and
+ * tdq_cpu_idle.
+ */
+ struct mtx_padalign tdq_lock; /* run queue lock. */
+ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */
+ volatile int tdq_load; /* Aggregate load. */
+ volatile int tdq_cpu_idle; /* cpu_idle() is active. */
+ int tdq_sysload; /* For loadavg, !ITHD load. */
+ int tdq_transferable; /* Transferable thread count. */
+ short tdq_switchcnt; /* Switches this tick. */
+ short tdq_oldswitchcnt; /* Switches last tick. */
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ u_char tdq_ipipending; /* IPI pending. */
+ u_char tdq_idx; /* Current insert index. */
+ u_char tdq_ridx; /* Current removal index. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
+ struct runq tdq_idle; /* Queue of IDLE threads. */
+ char tdq_name[TDQ_NAME_LEN];
+#ifdef KTR
+ char tdq_loadname[TDQ_LOADNAME_LEN];
+#endif
+} __aligned(64);
+
+/* Idle thread states and config. */
+#define TDQ_RUNNING 1
+#define TDQ_IDLE 2
+
+#ifdef SMP
+struct cpu_group *cpu_top; /* CPU topology */
+
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000))
+#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity))
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int balance_interval = 128; /* Default set in sched_initticks(). */
+static int affinity;
+static int steal_idle = 1;
+static int steal_thresh = 2;
+
+/*
+ * One thread queue per processor.
+ */
+static struct tdq tdq_cpu[MAXCPU];
+static struct tdq *balance_tdq;
+static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
+
+#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
+#define TDQ_CPU(x) (&tdq_cpu[(x)])
+#define TDQ_ID(x) ((int)((x) - tdq_cpu))
+#else /* !SMP */
+static struct tdq tdq_cpu;
+
+#define TDQ_ID(x) (0)
+#define TDQ_SELF() (&tdq_cpu)
+#define TDQ_CPU(x) (&tdq_cpu)
+#endif
+
+#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock))
+
+static void sched_priority(struct thread *);
+static void sched_thread_priority(struct thread *, u_char);
+static int sched_interact_score(struct thread *);
+static void sched_interact_update(struct thread *);
+static void sched_interact_fork(struct thread *);
+static void sched_pctcpu_update(struct td_sched *, int);
+
+/* Operations on per processor queues */
+static struct thread *tdq_choose(struct tdq *);
+static void tdq_setup(struct tdq *);
+static void tdq_load_add(struct tdq *, struct thread *);
+static void tdq_load_rem(struct tdq *, struct thread *);
+static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
+static __inline void tdq_runq_rem(struct tdq *, struct thread *);
+static inline int sched_shouldpreempt(int, int, int);
+void tdq_print(int cpu);
+static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
+#ifdef SMP
+static int tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct tdq *, struct thread *);
+static struct thread *tdq_steal(struct tdq *, int);
+static struct thread *runq_steal(struct runq *, int);
+static int sched_pickcpu(struct thread *, int);
+static void sched_balance(void);
+static int sched_balance_pair(struct tdq *, struct tdq *);
+static inline struct tdq *sched_setcpu(struct thread *, int, int);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
+static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
+static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
+ struct cpu_group *cg, int indent);
+#endif
+
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
+ NULL);
+
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *",
+ "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *",
+ "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *",
+ "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *",
+ "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+ "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+ "struct proc *");
+
+/*
+ * Print the threads waiting on a run-queue.
+ */
+static void
+runq_print(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+ int j;
+ int i;
+
+ for (i = 0; i < RQB_LEN; i++) {
+ printf("\t\trunq bits %d 0x%zx\n",
+ i, rq->rq_status.rqb_bits[i]);
+ for (j = 0; j < RQB_BPW; j++)
+ if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
+ pri = j + (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(td, rqh, td_runq) {
+ printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
+ td, td->td_name, td->td_priority,
+ td->td_rqindex, pri);
+ }
+ }
+ }
+}
+
+/*
+ * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
+ */
+void
+tdq_print(int cpu)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(cpu);
+
+ printf("tdq %d:\n", TDQ_ID(tdq));
+ printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tLock name: %s\n", tdq->tdq_name);
+ printf("\tload: %d\n", tdq->tdq_load);
+ printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt);
+ printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
+ printf("\tload transferable: %d\n", tdq->tdq_transferable);
+ printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
+ printf("\trealtime runq:\n");
+ runq_print(&tdq->tdq_realtime);
+ printf("\ttimeshare runq:\n");
+ runq_print(&tdq->tdq_timeshare);
+ printf("\tidle runq:\n");
+ runq_print(&tdq->tdq_idle);
+}
+
+static inline int
+sched_shouldpreempt(int pri, int cpri, int remote)
+{
+ /*
+ * If the new priority is not better than the current priority there is
+ * nothing to do.
+ */
+ if (pri >= cpri)
+ return (0);
+ /*
+ * Always preempt idle.
+ */
+ if (cpri >= PRI_MIN_IDLE)
+ return (1);
+ /*
+ * If preemption is disabled don't preempt others.
+ */
+ if (preempt_thresh == 0)
+ return (0);
+ /*
+ * Preempt if we exceed the threshold.
+ */
+ if (pri <= preempt_thresh)
+ return (1);
+ /*
+	 * If we're interactive or better and a non-interactive or worse
+	 * thread is running, preempt only on remote processors.
+ */
+ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
+ return (1);
+ return (0);
+}
+
+/*
+ * Add a thread to the actual run-queue. Keeps transferable counts up to
+ * date with what is actually on the run-queue. Selects the correct
+ * queue position for timeshare threads.
+ */
+static __inline void
+tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct td_sched *ts;
+ u_char pri;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ pri = td->td_priority;
+ ts = td->td_sched;
+ TD_SET_RUNQ(td);
+ if (THREAD_CAN_MIGRATE(td)) {
+ tdq->tdq_transferable++;
+ ts->ts_flags |= TSF_XFERABLE;
+ }
+ if (pri < PRI_MIN_BATCH) {
+ ts->ts_runq = &tdq->tdq_realtime;
+ } else if (pri <= PRI_MAX_BATCH) {
+ ts->ts_runq = &tdq->tdq_timeshare;
+ KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
+ ("Invalid priority %d on timeshare runq", pri));
+ /*
+		 * This queue contains only priorities between PRI_MIN_BATCH
+		 * and PRI_MAX_BATCH.  Use the whole range of queue indices
+		 * to represent these values.
+ */
+ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
+ pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
+ pri = (pri + tdq->tdq_idx) % RQ_NQS;
+ /*
+ * This effectively shortens the queue by one so we
+ * can have a one slot difference between idx and
+ * ridx while we wait for threads to drain.
+ */
+ if (tdq->tdq_ridx != tdq->tdq_idx &&
+ pri == tdq->tdq_ridx)
+ pri = (unsigned char)(pri - 1) % RQ_NQS;
+ } else
+ pri = tdq->tdq_ridx;
+ runq_add_pri(ts->ts_runq, td, pri, flags);
+ return;
+ } else
+ ts->ts_runq = &tdq->tdq_idle;
+ runq_add(ts->ts_runq, td, flags);
+}
+
+/*
+ * Remove a thread from a run-queue. This typically happens when a thread
+ * is selected to run. Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
+static __inline void
+tdq_runq_rem(struct tdq *tdq, struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(ts->ts_runq != NULL,
+ ("tdq_runq_remove: thread %p null ts_runq", td));
+ if (ts->ts_flags & TSF_XFERABLE) {
+ tdq->tdq_transferable--;
+ ts->ts_flags &= ~TSF_XFERABLE;
+ }
+ if (ts->ts_runq == &tdq->tdq_timeshare) {
+ if (tdq->tdq_idx != tdq->tdq_ridx)
+ runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
+ else
+ runq_remove_idx(ts->ts_runq, td, NULL);
+ } else
+ runq_remove(ts->ts_runq, td);
+}
+
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
+ * for this thread to the referenced thread queue.
+ */
+static void
+tdq_load_add(struct tdq *tdq, struct thread *td)
+{
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ tdq->tdq_load++;
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ tdq->tdq_sysload++;
+ KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+ SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+}
+
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
+static void
+tdq_load_rem(struct tdq *tdq, struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(tdq->tdq_load != 0,
+ ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+
+ tdq->tdq_load--;
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ tdq->tdq_sysload--;
+ KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+ SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+}
+
+/*
+ * Bound timeshare latency by decreasing slice size as load increases. We
+ * consider the maximum latency as the sum of the threads waiting to run
+ * aside from curthread and target no more than sched_slice latency but
+ * no less than sched_slice_min runtime.
+ */
+static inline int
+tdq_slice(struct tdq *tdq)
+{
+ int load;
+
+ /*
+	 * It is safe to use tdq_sysload here because this is called from
+ * contexts where timeshare threads are running and so there
+ * cannot be higher priority load in the system.
+ */
+ load = tdq->tdq_sysload - 1;
+ if (load >= SCHED_SLICE_MIN_DIVISOR)
+ return (sched_slice_min);
+ if (load <= 1)
+ return (sched_slice);
+ return (sched_slice / load);
+}
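+
+/*
+ * Illustration of the scaling above: when little else is runnable
+ * (tdq_sysload - 1 <= 1) a thread receives the full sched_slice; with
+ * N runnable timeshare threads it receives roughly sched_slice / N;
+ * and once N reaches SCHED_SLICE_MIN_DIVISOR the slice bottoms out at
+ * sched_slice_min so each thread still gets a useful amount of
+ * runtime before its slice expires.
+ */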
+
+/*
+ * Set lowpri to its exact value by searching the run-queue and
+ * evaluating curthread. curthread may be passed as an optimization.
+ */
+static void
+tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if (ctd == NULL)
+ ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
+ td = tdq_choose(tdq);
+ if (td == NULL || td->td_priority > ctd->td_priority)
+ tdq->tdq_lowpri = ctd->td_priority;
+ else
+ tdq->tdq_lowpri = td->td_priority;
+}
+
+#ifdef SMP
+struct cpu_search {
+ cpuset_t cs_mask;
+ u_int cs_prefer;
+ int cs_pri; /* Min priority for low. */
+ int cs_limit; /* Max load for low, min load for high. */
+ int cs_cpu;
+ int cs_load;
+};
+
+#define CPU_SEARCH_LOWEST 0x1
+#define CPU_SEARCH_HIGHEST 0x2
+#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
+
+#define CPUSET_FOREACH(cpu, mask) \
+ for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
+ if (CPU_ISSET(cpu, &mask))
+
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match);
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high);
+
+/*
+ * Search the tree of cpu_groups for the lowest or highest loaded cpu
+ * according to the match argument. This routine actually compares the
+ * load on all paths through the tree and finds the least loaded cpu on
+ * the least loaded path, which may differ from the least loaded cpu in
+ * the system. This balances work among caches and busses.
+ *
+ * This inline is instantiated in three forms below using constants for the
+ * match argument. It is reduced to the minimum set for each case. It is
+ * also recursive to the depth of the tree.
+ */
+static __inline int
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match)
+{
+ struct cpu_search lgroup;
+ struct cpu_search hgroup;
+ cpuset_t cpumask;
+ struct cpu_group *child;
+ struct tdq *tdq;
+ int cpu, i, hload, lload, load, total, rnd, *rndptr;
+
+ total = 0;
+ cpumask = cg->cg_mask;
+ if (match & CPU_SEARCH_LOWEST) {
+ lload = INT_MAX;
+ lgroup = *low;
+ }
+ if (match & CPU_SEARCH_HIGHEST) {
+ hload = INT_MIN;
+ hgroup = *high;
+ }
+
+ /* Iterate through the child CPU groups and then remaining CPUs. */
+ for (i = cg->cg_children, cpu = mp_maxid; ; ) {
+ if (i == 0) {
+#ifdef HAVE_INLINE_FFSL
+ cpu = CPU_FFS(&cpumask) - 1;
+#else
+ while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
+ cpu--;
+#endif
+ if (cpu < 0)
+ break;
+ child = NULL;
+ } else
+ child = &cg->cg_child[i - 1];
+
+ if (match & CPU_SEARCH_LOWEST)
+ lgroup.cs_cpu = -1;
+ if (match & CPU_SEARCH_HIGHEST)
+ hgroup.cs_cpu = -1;
+ if (child) { /* Handle child CPU group. */
+ CPU_NAND(&cpumask, &child->cg_mask);
+ switch (match) {
+ case CPU_SEARCH_LOWEST:
+ load = cpu_search_lowest(child, &lgroup);
+ break;
+ case CPU_SEARCH_HIGHEST:
+ load = cpu_search_highest(child, &hgroup);
+ break;
+ case CPU_SEARCH_BOTH:
+ load = cpu_search_both(child, &lgroup, &hgroup);
+ break;
+ }
+ } else { /* Handle child CPU. */
+ CPU_CLR(cpu, &cpumask);
+ tdq = TDQ_CPU(cpu);
+ load = tdq->tdq_load * 256;
+ rndptr = DPCPU_PTR(randomval);
+ rnd = (*rndptr = *rndptr * 69069 + 5) >> 26;
+ if (match & CPU_SEARCH_LOWEST) {
+ if (cpu == low->cs_prefer)
+ load -= 64;
+			/* If this CPU is allowed, record its load. */
+ if (tdq->tdq_lowpri > lgroup.cs_pri &&
+ tdq->tdq_load <= lgroup.cs_limit &&
+ CPU_ISSET(cpu, &lgroup.cs_mask)) {
+ lgroup.cs_cpu = cpu;
+ lgroup.cs_load = load - rnd;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (tdq->tdq_load >= hgroup.cs_limit &&
+ tdq->tdq_transferable &&
+ CPU_ISSET(cpu, &hgroup.cs_mask)) {
+ hgroup.cs_cpu = cpu;
+ hgroup.cs_load = load - rnd;
+ }
+ }
+ total += load;
+
+		/* We have info for this child; compare it with the best so far. */
+ if (match & CPU_SEARCH_LOWEST) {
+ if (lgroup.cs_cpu >= 0 &&
+ (load < lload ||
+ (load == lload && lgroup.cs_load < low->cs_load))) {
+ lload = load;
+ low->cs_cpu = lgroup.cs_cpu;
+ low->cs_load = lgroup.cs_load;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (hgroup.cs_cpu >= 0 &&
+ (load > hload ||
+ (load == hload && hgroup.cs_load > high->cs_load))) {
+ hload = load;
+ high->cs_cpu = hgroup.cs_cpu;
+ high->cs_load = hgroup.cs_load;
+ }
+ if (child) {
+ i--;
+ if (i == 0 && CPU_EMPTY(&cpumask))
+ break;
+ }
+#ifndef HAVE_INLINE_FFSL
+ else
+ cpu--;
+#endif
+ }
+ return (total);
+}
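+
+/*
+ * Note on the weighting above: each CPU contributes tdq_load * 256 to
+ * the path total, the caller's preferred CPU is discounted by 64 so
+ * affinity wins close calls, and a small per-CPU pseudo-random value
+ * (a linear congruential step reduced to a few bits) is subtracted so
+ * that ties between equally loaded CPUs are broken differently from
+ * search to search.
+ */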
+
+/*
+ * cpu_search instantiations must pass constants to maintain the inline
+ * optimization.
+ */
+int
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
+{
+ return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
+}
+
+int
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
+{
+ return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
+}
+
+int
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high)
+{
+ return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+}
+
+/*
+ * Find the cpu with the least load via the least loaded path that has a
+ * lowpri greater than pri.  A pri of -1 indicates any priority is
+ * acceptable.
+ */
+static inline int
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+ int prefer)
+{
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_prefer = prefer;
+ low.cs_mask = mask;
+ low.cs_pri = pri;
+ low.cs_limit = maxload;
+ cpu_search_lowest(cg, &low);
+ return low.cs_cpu;
+}
+
+/*
+ * Find the cpu with the highest load via the highest loaded path.
+ */
+static inline int
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
+{
+ struct cpu_search high;
+
+ high.cs_cpu = -1;
+ high.cs_mask = mask;
+ high.cs_limit = minload;
+ cpu_search_highest(cg, &high);
+ return high.cs_cpu;
+}
+
+/*
+ * Simultaneously find the highest and lowest loaded cpu reachable via
+ * cg.
+ */
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+{
+ struct cpu_search high;
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_prefer = -1;
+ low.cs_pri = -1;
+ low.cs_limit = INT_MAX;
+ low.cs_mask = mask;
+ high.cs_cpu = -1;
+ high.cs_limit = -1;
+ high.cs_mask = mask;
+ cpu_search_both(cg, &low, &high);
+ *lowcpu = low.cs_cpu;
+ *highcpu = high.cs_cpu;
+ return;
+}
+
+static void
+sched_balance_group(struct cpu_group *cg)
+{
+ cpuset_t hmask, lmask;
+ int high, low, anylow;
+
+ CPU_FILL(&hmask);
+ for (;;) {
+ high = sched_highest(cg, hmask, 1);
+		/* Stop if there is no more CPU with transferable threads. */
+ if (high == -1)
+ break;
+ CPU_CLR(high, &hmask);
+ CPU_COPY(&hmask, &lmask);
+ /* Stop if there is no more CPU left for low. */
+ if (CPU_EMPTY(&lmask))
+ break;
+ anylow = 1;
+nextlow:
+ low = sched_lowest(cg, lmask, -1,
+ TDQ_CPU(high)->tdq_load - 1, high);
+		/* Stop if a full search found no less loaded CPU. */
+ if (anylow && low == -1)
+ break;
+ /* Go to next high if we found no less loaded CPU. */
+ if (low == -1)
+ continue;
+ /* Transfer thread from high to low. */
+ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+ /* CPU that got thread can no longer be a donor. */
+ CPU_CLR(low, &hmask);
+ } else {
+ /*
+			 * If the move failed, then there are no threads on
+			 * high that can run on this low CPU.  Drop it from
+			 * the low mask and look for a different one.
+ */
+ CPU_CLR(low, &lmask);
+ anylow = 0;
+ goto nextlow;
+ }
+ }
+}
+
+static void
+sched_balance(void)
+{
+ struct tdq *tdq;
+
+ /*
+ * Select a random time between .5 * balance_interval and
+ * 1.5 * balance_interval.
+ */
+ balance_ticks = max(balance_interval / 2, 1);
+ balance_ticks += random() % balance_interval;
+ if (smp_started == 0 || rebalance == 0)
+ return;
+ tdq = TDQ_SELF();
+ TDQ_UNLOCK(tdq);
+ sched_balance_group(cpu_top);
+ TDQ_LOCK(tdq);
+}
+
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+ if (one < two) {
+ TDQ_LOCK(one);
+ TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+ } else {
+ TDQ_LOCK(two);
+ TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+ }
+}
+
+/*
+ * Unlock two thread queues. Order is not important here.
+ */
+static void
+tdq_unlock_pair(struct tdq *one, struct tdq *two)
+{
+ TDQ_UNLOCK(one);
+ TDQ_UNLOCK(two);
+}
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
+static int
+sched_balance_pair(struct tdq *high, struct tdq *low)
+{
+ int moved;
+ int cpu;
+
+ tdq_lock_pair(high, low);
+ moved = 0;
+ /*
+ * Determine what the imbalance is and then adjust that to how many
+ * threads we actually have to give up (transferable).
+ */
+ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+ (moved = tdq_move(high, low)) > 0) {
+ /*
+ * In case the target isn't the current cpu IPI it to force a
+ * reschedule with the new workload.
+ */
+ cpu = TDQ_ID(low);
+ if (cpu != PCPU_GET(cpuid))
+ ipi_cpu(cpu, IPI_PREEMPT);
+ }
+ tdq_unlock_pair(high, low);
+ return (moved);
+}
+
+/*
+ * Move a thread from one thread queue to another.
+ */
+static int
+tdq_move(struct tdq *from, struct tdq *to)
+{
+ struct td_sched *ts;
+ struct thread *td;
+ struct tdq *tdq;
+ int cpu;
+
+ TDQ_LOCK_ASSERT(from, MA_OWNED);
+ TDQ_LOCK_ASSERT(to, MA_OWNED);
+
+ tdq = from;
+ cpu = TDQ_ID(to);
+ td = tdq_steal(tdq, cpu);
+ if (td == NULL)
+ return (0);
+ ts = td->td_sched;
+ /*
+ * Although the run queue is locked the thread may be blocked. Lock
+ * it to clear this and acquire the run-queue lock.
+ */
+ thread_lock(td);
+ /* Drop recursive lock on from acquired via thread_lock(). */
+ TDQ_UNLOCK(from);
+ sched_rem(td);
+ ts->ts_cpu = cpu;
+ td->td_lock = TDQ_LOCKPTR(to);
+ tdq_add(to, td, SRQ_YIELDING);
+ return (1);
+}
+
+/*
+ * This tdq has idled. Try to steal a thread from another cpu and switch
+ * to it.
+ */
+static int
+tdq_idled(struct tdq *tdq)
+{
+ struct cpu_group *cg;
+ struct tdq *steal;
+ cpuset_t mask;
+ int thresh;
+ int cpu;
+
+ if (smp_started == 0 || steal_idle == 0)
+ return (1);
+ CPU_FILL(&mask);
+ CPU_CLR(PCPU_GET(cpuid), &mask);
+ /* We don't want to be preempted while we're iterating. */
+ spinlock_enter();
+ for (cg = tdq->tdq_cg; cg != NULL; ) {
+ if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
+ thresh = steal_thresh;
+ else
+ thresh = 1;
+ cpu = sched_highest(cg, mask, thresh);
+ if (cpu == -1) {
+ cg = cg->cg_parent;
+ continue;
+ }
+ steal = TDQ_CPU(cpu);
+ CPU_CLR(cpu, &mask);
+ tdq_lock_pair(tdq, steal);
+ if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
+ }
+ /*
+ * If a thread was added while interrupts were disabled don't
+ * steal one here. If we fail to acquire one due to affinity
+ * restrictions loop again with this cpu removed from the
+ * set.
+ */
+ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
+ }
+ spinlock_exit();
+ TDQ_UNLOCK(steal);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ thread_unlock(curthread);
+
+ return (0);
+ }
+ spinlock_exit();
+ return (1);
+}
+
+/*
+ * Notify a remote cpu of new work. Sends an IPI if criteria are met.
+ */
+static void
+tdq_notify(struct tdq *tdq, struct thread *td)
+{
+ struct thread *ctd;
+ int pri;
+ int cpu;
+
+ if (tdq->tdq_ipipending)
+ return;
+ cpu = td->td_sched->ts_cpu;
+ pri = td->td_priority;
+ ctd = pcpu_find(cpu)->pc_curthread;
+ if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
+ return;
+ if (TD_IS_IDLETHREAD(ctd)) {
+ /*
+ * If the MD code has an idle wakeup routine try that before
+ * falling back to IPI.
+ */
+ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
+ return;
+ }
+ tdq->tdq_ipipending = 1;
+ ipi_cpu(cpu, IPI_PREEMPT);
+}
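+
+/*
+ * To recap the checks above: no IPI is sent when one is already
+ * pending for the target, when the preemption test fails against the
+ * remote curthread, or when the remote CPU is idle but either has not
+ * yet entered its low-power idle path (tdq_cpu_idle is clear) or the
+ * MD idle wakeup hook already handled it; in those cases the idle
+ * loop will notice the new load on its own.
+ */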
+
+/*
+ * Steals load from a timeshare queue. Honors the rotating queue head
+ * index.
+ */
+static struct thread *
+runq_steal_from(struct runq *rq, int cpu, u_char start)
+{
+ struct rqbits *rqb;
+ struct rqhead *rqh;
+ struct thread *td, *first;
+ int bit;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+	bit = start & (RQB_BPW - 1);
+ pri = 0;
+ first = NULL;
+again:
+ for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+ if (rqb->rqb_bits[i] == 0)
+ continue;
+ if (bit != 0) {
+ for (pri = bit; pri < RQB_BPW; pri++)
+ if (rqb->rqb_bits[i] & (1ul << pri))
+ break;
+ if (pri >= RQB_BPW)
+ continue;
+ } else
+ pri = RQB_FFS(rqb->rqb_bits[i]);
+ pri += (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(td, rqh, td_runq) {
+ if (first && THREAD_CAN_MIGRATE(td) &&
+ THREAD_CAN_SCHED(td, cpu))
+ return (td);
+ first = td;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+
+ if (first && THREAD_CAN_MIGRATE(first) &&
+ THREAD_CAN_SCHED(first, cpu))
+ return (first);
+ return (NULL);
+}
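+
+/*
+ * Note that the search above never returns the first thread it
+ * examines right away: it records it in 'first' and prefers a later
+ * queued thread that is both migratable and allowed on the stealing
+ * CPU, falling back to whatever 'first' holds only when nothing else
+ * qualifies.
+ */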
+
+/*
+ * Steals load from a standard linear queue.
+ */
+static struct thread *
+runq_steal(struct runq *rq, int cpu)
+{
+ struct rqhead *rqh;
+ struct rqbits *rqb;
+ struct thread *td;
+ int word;
+ int bit;
+
+ rqb = &rq->rq_status;
+ for (word = 0; word < RQB_LEN; word++) {
+ if (rqb->rqb_bits[word] == 0)
+ continue;
+ for (bit = 0; bit < RQB_BPW; bit++) {
+ if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
+ continue;
+ rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
+ TAILQ_FOREACH(td, rqh, td_runq)
+ if (THREAD_CAN_MIGRATE(td) &&
+ THREAD_CAN_SCHED(td, cpu))
+ return (td);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Attempt to steal a thread in priority order from a thread queue.
+ */
+static struct thread *
+tdq_steal(struct tdq *tdq, int cpu)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
+ return (td);
+ if ((td = runq_steal_from(&tdq->tdq_timeshare,
+ cpu, tdq->tdq_ridx)) != NULL)
+ return (td);
+ return (runq_steal(&tdq->tdq_idle, cpu));
+}
+
+/*
+ * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the
+ * current lock and returns with the assigned queue locked.
+ */
+static inline struct tdq *
+sched_setcpu(struct thread *td, int cpu, int flags)
+{
+ struct tdq *tdq;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ tdq = TDQ_CPU(cpu);
+ td->td_sched->ts_cpu = cpu;
+ /*
+ * If the lock matches just return the queue.
+ */
+ if (td->td_lock == TDQ_LOCKPTR(tdq))
+ return (tdq);
+#ifdef notyet
+ /*
+ * If the thread isn't running its lockptr is a
+ * turnstile or a sleepqueue. We can just lock_set without
+ * blocking.
+ */
+ if (TD_CAN_RUN(td)) {
+ TDQ_LOCK(tdq);
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ return (tdq);
+ }
+#endif
+ /*
+ * The hard case, migration, we need to block the thread first to
+ * prevent order reversals with other cpus locks.
+ */
+ spinlock_enter();
+ thread_lock_block(td);
+ TDQ_LOCK(tdq);
+ thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+ spinlock_exit();
+ return (tdq);
+}
+
+SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
+SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
+SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
+SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
+SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
+SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
+
+static int
+sched_pickcpu(struct thread *td, int flags)
+{
+ struct cpu_group *cg, *ccg;
+ struct td_sched *ts;
+ struct tdq *tdq;
+ cpuset_t mask;
+ int cpu, pri, self;
+
+ self = PCPU_GET(cpuid);
+ ts = td->td_sched;
+ if (smp_started == 0)
+ return (self);
+ /*
+ * Don't migrate a running thread from sched_switch().
+ */
+ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
+ return (ts->ts_cpu);
+ /*
+ * Prefer to run interrupt threads on the processors that generate
+ * the interrupt.
+ */
+ pri = td->td_priority;
+ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
+ curthread->td_intr_nesting_level && ts->ts_cpu != self) {
+ SCHED_STAT_INC(pickcpu_intrbind);
+ ts->ts_cpu = self;
+ if (TDQ_CPU(self)->tdq_lowpri > pri) {
+ SCHED_STAT_INC(pickcpu_affinity);
+ return (ts->ts_cpu);
+ }
+ }
+ /*
+ * If the thread can run on the last cpu and the affinity has not
+ * expired or it is idle run it there.
+ */
+ tdq = TDQ_CPU(ts->ts_cpu);
+ cg = tdq->tdq_cg;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+ tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+ SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+ if (cg->cg_flags & CG_FLAG_THREAD) {
+ CPUSET_FOREACH(cpu, cg->cg_mask) {
+ if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+ break;
+ }
+ } else
+ cpu = INT_MAX;
+ if (cpu > mp_maxid) {
+ SCHED_STAT_INC(pickcpu_idle_affinity);
+ return (ts->ts_cpu);
+ }
+ }
+ /*
+ * Search for the last level cache CPU group in the tree.
+ * Skip caches with expired affinity time and SMT groups.
+ * Affinity to higher level caches will be handled less aggressively.
+ */
+ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+ if (cg->cg_flags & CG_FLAG_THREAD)
+ continue;
+ if (!SCHED_AFFINITY(ts, cg->cg_level))
+ continue;
+ ccg = cg;
+ }
+ if (ccg != NULL)
+ cg = ccg;
+ cpu = -1;
+	/* Search the group for the least loaded idle CPU we can run now. */
+ mask = td->td_cpuset->cs_mask;
+ if (cg != NULL && cg != cpu_top &&
+ CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
+ cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+ INT_MAX, ts->ts_cpu);
+	/* Search globally for the least loaded CPU we can run now. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+	/* Search globally for the least loaded CPU. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
+ KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
+ /*
+ * Compare the lowest loaded cpu to current cpu.
+ */
+ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
+ TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+ TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
+ SCHED_STAT_INC(pickcpu_local);
+ cpu = self;
+ } else
+ SCHED_STAT_INC(pickcpu_lowest);
+ if (cpu != ts->ts_cpu)
+ SCHED_STAT_INC(pickcpu_migration);
+ return (cpu);
+}
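+
+/*
+ * The search above runs from strongest to weakest preference:
+ * interrupt threads are bound softly to the current CPU, the last CPU
+ * (or, within an SMT group, an idle sibling) is reused while it is
+ * idle and the affinity window has not expired, then the least loaded
+ * CPU within the last-level cache group is tried, then the least
+ * loaded CPU system-wide that could run the thread at its priority,
+ * and finally the least loaded CPU regardless of priority.  A
+ * comparably loaded current CPU is preferred at the very end to avoid
+ * a pointless migration.
+ */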
+#endif
+
+/*
+ * Pick the highest priority task we have and return it.
+ */
+static struct thread *
+tdq_choose(struct tdq *tdq)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ td = runq_choose(&tdq->tdq_realtime);
+ if (td != NULL)
+ return (td);
+ td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
+ if (td != NULL) {
+ KASSERT(td->td_priority >= PRI_MIN_BATCH,
+ ("tdq_choose: Invalid priority on timeshare queue %d",
+ td->td_priority));
+ return (td);
+ }
+ td = runq_choose(&tdq->tdq_idle);
+ if (td != NULL) {
+ KASSERT(td->td_priority >= PRI_MIN_IDLE,
+ ("tdq_choose: Invalid priority on idle queue %d",
+ td->td_priority));
+ return (td);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Initialize a thread queue.
+ */
+static void
+tdq_setup(struct tdq *tdq)
+{
+
+ if (bootverbose)
+ printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
+ runq_init(&tdq->tdq_realtime);
+ runq_init(&tdq->tdq_timeshare);
+ runq_init(&tdq->tdq_idle);
+ snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
+ "sched lock %d", (int)TDQ_ID(tdq));
+ mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
+ MTX_SPIN | MTX_RECURSE);
+#ifdef KTR
+ snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
+ "CPU %d load", (int)TDQ_ID(tdq));
+#endif
+}
+
+#ifdef SMP
+static void
+sched_setup_smp(void)
+{
+ struct tdq *tdq;
+ int i;
+
+ cpu_top = smp_topo();
+ CPU_FOREACH(i) {
+ tdq = TDQ_CPU(i);
+ tdq_setup(tdq);
+ tdq->tdq_cg = smp_topo_find(cpu_top, i);
+ if (tdq->tdq_cg == NULL)
+ panic("Can't find cpu group for %d\n", i);
+ }
+ balance_tdq = TDQ_SELF();
+ sched_balance();
+}
+#endif
+
+/*
+ * Setup the thread queues and initialize the topology based on MD
+ * information.
+ */
+static void
+sched_setup(void *dummy)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+#ifdef SMP
+ sched_setup_smp();
+#else
+ tdq_setup(tdq);
+#endif
+
+ /* Add thread0's load since it's running. */
+ TDQ_LOCK(tdq);
+ thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+ tdq_load_add(tdq, &thread0);
+ tdq->tdq_lowpri = thread0.td_priority;
+ TDQ_UNLOCK(tdq);
+}
+
+/*
+ * This routine determines time constants after stathz and hz are set up.
+ */
+/* ARGSUSED */
+static void
+sched_initticks(void *dummy)
+{
+ int incr;
+
+ realstathz = stathz ? stathz : hz;
+ sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
+ sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+
+ /*
+ * tickincr is shifted out by 10 to avoid rounding errors due to
+ * hz not being evenly divisible by stathz on all platforms.
+ */
+ incr = (hz << SCHED_TICK_SHIFT) / realstathz;
+ /*
+ * This does not work for values of stathz that are more than
+ * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
+ */
+ if (incr == 0)
+ incr = 1;
+ tickincr = incr;
+#ifdef SMP
+ /*
+ * Set the default balance interval now that we know
+ * what realstathz is.
+ */
+ balance_interval = realstathz;
+ affinity = SCHED_AFFINITY_DEFAULT;
+#endif
+ if (sched_idlespinthresh < 0)
+ sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
+}
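+
+/*
+ * Worked example of the fixed-point math above, assuming the common
+ * hz = 1000, stathz = 127 configuration: tickincr = (1000 << 10) / 127,
+ * about 8062, i.e. roughly 7.87 hz ticks per stathz tick carried with
+ * ten bits of fraction.
+ */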
+
+/*
+ * This is the core of the interactivity algorithm. Determines a score based
+ * on past behavior. It is the ratio of sleep time to run time scaled to
+ * a [0, 100] integer. This is the voluntary sleep time of a process, which
+ * differs from the cpu usage because it does not account for time spent
+ * waiting on a run-queue. Would be prettier if we had floating point.
+ */
+static int
+sched_interact_score(struct thread *td)
+{
+ struct td_sched *ts;
+ int div;
+
+ ts = td->td_sched;
+ /*
+ * The score is only needed if this is likely to be an interactive
+ * task. Don't go through the expense of computing it if there's
+ * no chance.
+ */
+ if (sched_interact <= SCHED_INTERACT_HALF &&
+ ts->ts_runtime >= ts->ts_slptime)
+ return (SCHED_INTERACT_HALF);
+
+ if (ts->ts_runtime > ts->ts_slptime) {
+ div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
+ return (SCHED_INTERACT_HALF +
+ (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
+ }
+ if (ts->ts_slptime > ts->ts_runtime) {
+ div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
+ return (ts->ts_runtime / div);
+ }
+ /* runtime == slptime */
+ if (ts->ts_runtime)
+ return (SCHED_INTERACT_HALF);
+
+ /*
+ * This can happen if slptime and runtime are 0.
+ */
+ return (0);
+}
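+
+/*
+ * Put differently, the code above evaluates to roughly
+ * SCHED_INTERACT_HALF * runtime / slptime for threads that sleep more
+ * than they run (scores below HALF) and to roughly
+ * HALF * (2 - slptime / runtime) for threads that run more than they
+ * sleep (scores above HALF); a thread that sleeps three times as long
+ * as it runs therefore scores about a third of SCHED_INTERACT_HALF.
+ */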
+
+/*
+ * Scale the scheduling priority according to the "interactivity" of this
+ * process.
+ */
+static void
+sched_priority(struct thread *td)
+{
+ int score;
+ int pri;
+
+ if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
+ return;
+ /*
+ * If the score is interactive we place the thread in the realtime
+ * queue with a priority that is less than kernel and interrupt
+ * priorities. These threads are not subject to nice restrictions.
+ *
+ * Scores greater than this are placed on the normal timeshare queue
+ * where the priority is partially decided by the most recent cpu
+ * utilization and the rest is decided by nice value.
+ *
+ * The nice value of the process has a linear effect on the calculated
+ * score. Negative nice values make it easier for a thread to be
+ * considered interactive.
+ */
+ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
+ if (score < sched_interact) {
+ pri = PRI_MIN_INTERACT;
+ pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
+ sched_interact) * score;
+ KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
+ ("sched_priority: invalid interactive priority %d score %d",
+ pri, score));
+ } else {
+ pri = SCHED_PRI_MIN;
+ if (td->td_sched->ts_ticks)
+ pri += min(SCHED_PRI_TICKS(td->td_sched),
+ SCHED_PRI_RANGE);
+ pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+ KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
+ ("sched_priority: invalid priority %d: nice %d, "
+ "ticks %d ftick %d ltick %d tick pri %d",
+ pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
+ td->td_sched->ts_ftick, td->td_sched->ts_ltick,
+ SCHED_PRI_TICKS(td->td_sched)));
+ }
+ sched_user_prio(td, pri);
+
+ return;
+}
+
+/*
+ * This routine enforces a maximum limit on the amount of scheduling history
+ * kept. It is called after either the slptime or runtime is adjusted. This
+ * function is ugly due to integer math.
+ */
+static void
+sched_interact_update(struct thread *td)
+{
+ struct td_sched *ts;
+ u_int sum;
+
+ ts = td->td_sched;
+ sum = ts->ts_runtime + ts->ts_slptime;
+ if (sum < SCHED_SLP_RUN_MAX)
+ return;
+ /*
+ * This only happens from two places:
+ * 1) We have added an unusual amount of run time from fork_exit.
+ * 2) We have added an unusual amount of sleep time from sched_sleep().
+ */
+ if (sum > SCHED_SLP_RUN_MAX * 2) {
+ if (ts->ts_runtime > ts->ts_slptime) {
+ ts->ts_runtime = SCHED_SLP_RUN_MAX;
+ ts->ts_slptime = 1;
+ } else {
+ ts->ts_slptime = SCHED_SLP_RUN_MAX;
+ ts->ts_runtime = 1;
+ }
+ return;
+ }
+ /*
+ * If we have exceeded by more than 1/5th then the algorithm below
+ * will not bring us back into range. Dividing by two here forces
+	 * us back to no more than SCHED_SLP_RUN_MAX while keeping the ratio.
+ */
+ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
+ ts->ts_runtime /= 2;
+ ts->ts_slptime /= 2;
+ return;
+ }
+ ts->ts_runtime = (ts->ts_runtime / 5) * 4;
+ ts->ts_slptime = (ts->ts_slptime / 5) * 4;
+}
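+
+/*
+ * In the common case above both totals are scaled back by 4/5 once
+ * their sum reaches SCHED_SLP_RUN_MAX (or halved when the cap was
+ * overshot by more than a fifth), which decays old history while
+ * preserving the runtime/slptime ratio; only the doubled-cap case
+ * from fork_exit() or sched_sleep() clamps the values outright.
+ */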
+
+/*
+ * Scale back the interactivity history when a child thread is created. The
+ * history is inherited from the parent but the thread may behave totally
+ * differently. For example, a shell spawning a compiler process. We want
+ * to learn that the compiler is behaving badly very quickly.
+ */
+static void
+sched_interact_fork(struct thread *td)
+{
+ int ratio;
+ int sum;
+
+ sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
+ if (sum > SCHED_SLP_RUN_FORK) {
+ ratio = sum / SCHED_SLP_RUN_FORK;
+ td->td_sched->ts_runtime /= ratio;
+ td->td_sched->ts_slptime /= ratio;
+ }
+}
+
+/*
+ * Called from proc0_init() to setup the scheduler fields.
+ */
+void
+schedinit(void)
+{
+
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ proc0.p_sched = NULL; /* XXX */
+ thread0.td_sched = &td_sched0;
+ td_sched0.ts_ltick = ticks;
+ td_sched0.ts_ftick = ticks;
+ td_sched0.ts_slice = 0;
+}
+
+/*
+ * This is only somewhat accurate since, given many processes of the
+ * same priority, they will switch when their slices run out, which
+ * will be at most sched_slice stathz ticks.
+ */
+int
+sched_rr_interval(void)
+{
+
+ /* Convert sched_slice from stathz to hz. */
+ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
+}
+
+/*
+ * Update the percent cpu tracking information when it is requested or
+ * the total history exceeds the maximum. We keep a sliding history of
+ * tick counts that slowly decays. This is less precise than the 4BSD
+ * mechanism since it is driven by less regular and less frequent events.
+ */
+static void
+sched_pctcpu_update(struct td_sched *ts, int run)
+{
+ int t = ticks;
+
+ if (t - ts->ts_ltick >= SCHED_TICK_TARG) {
+ ts->ts_ticks = 0;
+ ts->ts_ftick = t - SCHED_TICK_TARG;
+ } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
+ ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
+ (ts->ts_ltick - (t - SCHED_TICK_TARG));
+ ts->ts_ftick = t - SCHED_TICK_TARG;
+ }
+ if (run)
+ ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
+ ts->ts_ltick = t;
+}
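+
+/*
+ * The history above is a sliding window: once the span since ts_ftick
+ * reaches SCHED_TICK_MAX the accumulated count is scaled down so that
+ * roughly SCHED_TICK_TARG ticks of history remain, and a thread whose
+ * counters have not been touched for SCHED_TICK_TARG ticks starts
+ * over with an empty history.
+ */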
+
+/*
+ * Adjust the priority of a thread. Move it to the appropriate run-queue
+ * if necessary. This is the back-end for several priority related
+ * functions.
+ */
+static void
+sched_thread_priority(struct thread *td, u_char prio)
+{
+ struct td_sched *ts;
+ struct tdq *tdq;
+ int oldpri;
+
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
+ "prio:%d", td->td_priority, "new prio:%d", prio,
+ KTR_ATTR_LINKED, sched_tdname(curthread));
+ SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+ if (td != curthread && prio < td->td_priority) {
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
+ "lend prio", "prio:%d", td->td_priority, "new prio:%d",
+ prio, KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio,
+ curthread);
+ }
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority == prio)
+ return;
+ /*
+ * If the priority has been elevated due to priority
+ * propagation, we may have to move ourselves to a new
+ * queue. This could be optimized to not re-add in some
+ * cases.
+ */
+ if (TD_ON_RUNQ(td) && prio < td->td_priority) {
+ sched_rem(td);
+ td->td_priority = prio;
+ sched_add(td, SRQ_BORROWING);
+ return;
+ }
+ /*
+ * If the thread is currently running we may have to adjust the lowpri
+ * information so other cpus are aware of our current priority.
+ */
+ if (TD_IS_RUNNING(td)) {
+ tdq = TDQ_CPU(ts->ts_cpu);
+ oldpri = td->td_priority;
+ td->td_priority = prio;
+ if (prio < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = prio;
+ else if (tdq->tdq_lowpri == oldpri)
+ tdq_setlowpri(tdq, td);
+ return;
+ }
+ td->td_priority = prio;
+}
+
+/*
+ * Update a thread's priority when it is lent another thread's
+ * priority.
+ */
+void
+sched_lend_prio(struct thread *td, u_char prio)
+{
+
+ td->td_flags |= TDF_BORROWING;
+ sched_thread_priority(td, prio);
+}
+
+/*
+ * Restore a thread's priority when priority propagation is
+ * over. The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests. If the thread's regular priority is less
+ * important than prio, the thread will keep a priority boost
+ * of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+ td->td_base_pri <= PRI_MAX_TIMESHARE)
+ base_pri = td->td_user_pri;
+ else
+ base_pri = td->td_base_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_BORROWING;
+ sched_thread_priority(td, base_pri);
+ } else
+ sched_lend_prio(td, prio);
+}
+
+/*
+ * Standard entry for setting the priority to an absolute value.
+ */
+void
+sched_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ /* First, update the base priority. */
+ td->td_base_pri = prio;
+
+ /*
+ * If the thread is borrowing another thread's priority, don't
+ * ever lower the priority.
+ */
+ if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+ return;
+
+ /* Change the real priority. */
+ oldprio = td->td_priority;
+ sched_thread_priority(td, prio);
+
+ /*
+ * If the thread is on a turnstile, then let the turnstile update
+ * its state.
+ */
+ if (TD_ON_LOCK(td) && oldprio != prio)
+ turnstile_adjust(td, oldprio);
+}
+
+/*
+ * Set the base user priority; this does not affect the current
+ * running priority.
+ */
+void
+sched_user_prio(struct thread *td, u_char prio)
+{
+
+ td->td_base_user_pri = prio;
+ if (td->td_lend_user_pri <= prio)
+ return;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_lend_user_pri = prio;
+ td->td_user_pri = min(prio, td->td_base_user_pri);
+ if (td->td_priority > td->td_user_pri)
+ sched_prio(td, td->td_user_pri);
+ else if (td->td_priority != td->td_user_pri)
+ td->td_flags |= TDF_NEEDRESCHED;
+}
+
+/*
+ * Handle migration from sched_switch(). This happens only for
+ * cpu binding.
+ */
+static struct mtx *
+sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct tdq *tdn;
+
+ tdn = TDQ_CPU(td->td_sched->ts_cpu);
+#ifdef SMP
+ tdq_load_rem(tdq, td);
+ /*
+ * Do the lock dance required to avoid LOR. We grab an extra
+ * spinlock nesting to prevent preemption while we're
+ * not holding either run-queue lock.
+ */
+ spinlock_enter();
+ thread_lock_block(td); /* This releases the lock on tdq. */
+
+ /*
+ * Acquire both run-queue locks before placing the thread on the new
+ * run-queue to avoid deadlocks created by placing a thread with a
+ * blocked lock on the run-queue of a remote processor. The deadlock
+ * occurs when a third processor attempts to lock the two queues in
+ * question while the target processor is spinning with its own
+ * run-queue lock held while waiting for the blocked lock to clear.
+ */
+ tdq_lock_pair(tdn, tdq);
+ tdq_add(tdn, td, flags);
+ tdq_notify(tdn, td);
+ TDQ_UNLOCK(tdn);
+ spinlock_exit();
+#endif
+ return (TDQ_LOCKPTR(tdn));
+}
+
+/*
+ * Variant of thread_lock_unblock() that does not assume td_lock
+ * is blocked.
+ */
+static inline void
+thread_unblock_switch(struct thread *td, struct mtx *mtx)
+{
+ atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
+ (uintptr_t)mtx);
+}
+
+/*
+ * Switch threads. This function has to handle threads coming in while
+ * blocked for some reason, running, or idle. It also must deal with
+ * migrating a thread from one queue to another as running threads may
+ * be assigned elsewhere via binding.
+ */
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct tdq *tdq;
+ struct td_sched *ts;
+ struct mtx *mtx;
+ int srqflag;
+ int cpuid, preempted;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
+
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ mtx = td->td_lock;
+ sched_pctcpu_update(ts, 1);
+ ts->ts_rltick = ticks;
+ td->td_lastcpu = td->td_oncpu;
+ td->td_oncpu = NOCPU;
+ preempted = !(td->td_flags & TDF_SLICEEND);
+ td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+ td->td_owepreempt = 0;
+ if (!TD_IS_IDLETHREAD(td))
+ tdq->tdq_switchcnt++;
+ /*
+ * The lock pointer in an idle thread should never change. Reset it
+ * to CAN_RUN as well.
+ */
+ if (TD_IS_IDLETHREAD(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ TD_SET_CAN_RUN(td);
+ } else if (TD_IS_RUNNING(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ srqflag = preempted ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING;
+#ifdef SMP
+ if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
+ ts->ts_cpu = sched_pickcpu(td, 0);
+#endif
+ if (ts->ts_cpu == cpuid)
+ tdq_runq_add(tdq, td, srqflag);
+ else {
+ KASSERT(THREAD_CAN_MIGRATE(td) ||
+ (ts->ts_flags & TSF_BOUND) != 0,
+ ("Thread %p shouldn't migrate", td));
+ mtx = sched_switch_migrate(tdq, td, srqflag);
+ }
+ } else {
+ /* This thread must be going to sleep. */
+ TDQ_LOCK(tdq);
+ mtx = thread_lock_block(td);
+ tdq_load_rem(tdq, td);
+ }
+ /*
+ * We enter here with the thread blocked and assigned to the
+ * appropriate cpu run-queue or sleep-queue and with the current
+ * thread-queue locked.
+ */
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+ newtd = choosethread();
+ /*
+ * Call the MD code to switch contexts if necessary.
+ */
+ if (td != newtd) {
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+ SDT_PROBE2(sched, , , off_cpu, newtd, newtd->td_proc);
+ lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ sched_pctcpu_update(newtd->td_sched, 0);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If DTrace has set the active vtime enum to anything
+ * other than INACTIVE (0), then it should have set the
+ * function to call.
+ */
+ if (dtrace_vtime_active)
+ (*dtrace_vtime_switch_func)(newtd);
+#endif
+
+ cpu_switch(td, newtd, mtx);
+ /*
+ * We may return from cpu_switch on a different cpu. However,
+ * we always return with td_lock pointing to the current cpu's
+ * run queue lock.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ lock_profile_obtain_lock_success(
+ &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+
+ SDT_PROBE0(sched, , , on_cpu);
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+ } else {
+ thread_unblock_switch(td, mtx);
+ SDT_PROBE0(sched, , , remain_cpu);
+ }
+ /*
+ * Assert that all went well and return.
+ */
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
+}
+
+/*
+ * Adjust thread priorities as a result of a nice request.
+ */
+void
+sched_nice(struct proc *p, int nice)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ p->p_nice = nice;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ sched_priority(td);
+ sched_prio(td, td->td_base_user_pri);
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Record the sleep time for the interactivity scorer.
+ */
+void
+sched_sleep(struct thread *td, int prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ td->td_slptick = ticks;
+ if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
+ td->td_flags |= TDF_CANSWAP;
+ if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
+ return;
+ if (static_boost == 1 && prio)
+ sched_prio(td, prio);
+ else if (static_boost && td->td_priority > static_boost)
+ sched_prio(td, static_boost);
+}
+
+/*
+ * Schedule a thread to resume execution and record how long it voluntarily
+ * slept. We also update the pctcpu, interactivity, and priority.
+ */
+void
+sched_wakeup(struct thread *td)
+{
+ struct td_sched *ts;
+ int slptick;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ td->td_flags &= ~TDF_CANSWAP;
+ /*
+ * If we slept for more than a tick update our interactivity and
+ * priority.
+ */
+ slptick = td->td_slptick;
+ td->td_slptick = 0;
+ if (slptick && slptick != ticks) {
+ ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
+ sched_interact_update(td);
+ sched_pctcpu_update(ts, 0);
+ }
+ /*
+ * Reset the slice value since we slept and advanced the round-robin.
+ */
+ ts->ts_slice = 0;
+ sched_add(td, SRQ_BORING);
+}
+
+/*
+ * Penalize the parent for creating a new child and initialize the child's
+ * priority.
+ */
+void
+sched_fork(struct thread *td, struct thread *child)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sched_pctcpu_update(td->td_sched, 1);
+ sched_fork_thread(td, child);
+ /*
+ * Penalize the parent and child for forking.
+ */
+ sched_interact_fork(child);
+ sched_priority(child);
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
+}
+
+/*
+ * Fork a new thread, may be within the same process.
+ */
+void
+sched_fork_thread(struct thread *td, struct thread *child)
+{
+ struct td_sched *ts;
+ struct td_sched *ts2;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Initialize child.
+ */
+ ts = td->td_sched;
+ ts2 = child->td_sched;
+ child->td_lock = TDQ_LOCKPTR(tdq);
+ child->td_cpuset = cpuset_ref(td->td_cpuset);
+ ts2->ts_cpu = ts->ts_cpu;
+ ts2->ts_flags = 0;
+ /*
+	 * Grab our parent's cpu estimation information.
+ */
+ ts2->ts_ticks = ts->ts_ticks;
+ ts2->ts_ltick = ts->ts_ltick;
+ ts2->ts_ftick = ts->ts_ftick;
+ /*
+ * Do not inherit any borrowed priority from the parent.
+ */
+ child->td_priority = child->td_base_pri;
+ /*
+ * And update interactivity score.
+ */
+ ts2->ts_slptime = ts->ts_slptime;
+ ts2->ts_runtime = ts->ts_runtime;
+ /* Attempt to quickly learn interactivity. */
+ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
+#ifdef KTR
+ bzero(ts2->ts_name, sizeof(ts2->ts_name));
+#endif
+}
+
+/*
+ * Adjust the priority class of a thread.
+ */
+void
+sched_class(struct thread *td, int class)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_pri_class == class)
+ return;
+ td->td_pri_class = class;
+}
+
+/*
+ * Return some of the child's priority and interactivity to the parent.
+ */
+void
+sched_exit(struct proc *p, struct thread *child)
+{
+ struct thread *td;
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
+ "prio:%d", child->td_priority);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ td = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td, child);
+}
+
+/*
+ * Penalize another thread for the time spent on this one. This helps to
+ * worsen the priority and interactivity of processes which schedule batch
+ * jobs such as make. This has little effect on the make process itself but
+ * causes new processes spawned by it to receive worse scores immediately.
+ */
+void
+sched_exit_thread(struct thread *td, struct thread *child)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
+ "prio:%d", child->td_priority);
+ /*
+ * Give the child's runtime to the parent without returning the
+ * sleep time as a penalty to the parent. This causes shells that
+ * launch expensive things to mark their children as expensive.
+ */
+ thread_lock(td);
+ td->td_sched->ts_runtime += child->td_sched->ts_runtime;
+ sched_interact_update(td);
+ sched_priority(td);
+ thread_unlock(td);
+}
+
+void
+sched_preempt(struct thread *td)
+{
+ struct tdq *tdq;
+
+ SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+
+ thread_lock(td);
+ tdq = TDQ_SELF();
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ tdq->tdq_ipipending = 0;
+ if (td->td_priority > tdq->tdq_lowpri) {
+ int flags;
+
+ flags = SW_INVOL | SW_PREEMPT;
+ if (td->td_critnest > 1)
+ td->td_owepreempt = 1;
+ else if (TD_IS_IDLETHREAD(td))
+ mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
+ else
+ mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
+ }
+ thread_unlock(td);
+}
+
+/*
+ * Fix priorities on return to user-space. Priorities may be elevated due
+ * to static priorities in msleep() or similar.
+ */
+void
+sched_userret(struct thread *td)
+{
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ tdq_setlowpri(TDQ_SELF(), td);
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Handle a stathz tick. This is really only relevant for timeshare
+ * threads.
+ */
+void
+sched_clock(struct thread *td)
+{
+ struct tdq *tdq;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ tdq = TDQ_SELF();
+#ifdef SMP
+ /*
+ * We run the long term load balancer infrequently on the first cpu.
+ */
+ if (balance_tdq == tdq) {
+ if (balance_ticks && --balance_ticks == 0)
+ sched_balance();
+ }
+#endif
+ /*
+	 * Save the old switch count so we have a record of the last tick's
+	 * activity.  Initialize the new switch count based on our load; if
+	 * there is some activity, seed it to reflect that.
+ */
+ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
+ tdq->tdq_switchcnt = tdq->tdq_load;
+ /*
+ * Advance the insert index once for each tick to ensure that all
+ * threads get a chance to run.
+ */
+ if (tdq->tdq_idx == tdq->tdq_ridx) {
+ tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
+ if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
+ tdq->tdq_ridx = tdq->tdq_idx;
+ }
+ ts = td->td_sched;
+ sched_pctcpu_update(ts, 1);
+ if (td->td_pri_class & PRI_FIFO_BIT)
+ return;
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
+ /*
+ * We used a tick; charge it to the thread so
+ * that we can compute our interactivity.
+ */
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
+ }
+
+ /*
+ * Force a context switch if the current thread has used up a full
+ * time slice (default is 100ms).
+ */
+ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) {
+ ts->ts_slice = 0;
+ td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
+ }
+}
+
+/*
+ * Called once per hz tick.
+ */
+void
+sched_tick(int cnt)
+{
+
+}
+
+/*
+ * Return whether the current CPU has runnable tasks. Used for in-kernel
+ * cooperative idle threads.
+ */
+int
+sched_runnable(void)
+{
+ struct tdq *tdq;
+ int load;
+
+ load = 1;
+
+ tdq = TDQ_SELF();
+ if ((curthread->td_flags & TDF_IDLETD) != 0) {
+ if (tdq->tdq_load > 0)
+ goto out;
+ } else
+ if (tdq->tdq_load - 1 > 0)
+ goto out;
+ load = 0;
+out:
+ return (load);
+}
+
+/*
+ * Choose the highest priority thread to run. The thread is removed from
+ * the run-queue while running; the load, however, remains.  For SMP we set
+ * the tdq in the global idle bitmask if it idles here.
+ */
+struct thread *
+sched_choose(void)
+{
+ struct thread *td;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ td = tdq_choose(tdq);
+ if (td) {
+ tdq_runq_rem(tdq, td);
+ tdq->tdq_lowpri = td->td_priority;
+ return (td);
+ }
+ tdq->tdq_lowpri = PRI_MAX_IDLE;
+ return (PCPU_GET(idlethread));
+}
+
+/*
+ * Set owepreempt if necessary.  Preemption never happens directly in
+ * ULE; we always request it once we exit a critical section.
+ */
+static inline void
+sched_setpreempt(struct thread *td)
+{
+ struct thread *ctd;
+ int cpri;
+ int pri;
+
+ THREAD_LOCK_ASSERT(curthread, MA_OWNED);
+
+ ctd = curthread;
+ pri = td->td_priority;
+ cpri = ctd->td_priority;
+ if (pri < cpri)
+ ctd->td_flags |= TDF_NEEDRESCHED;
+ if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
+ return;
+ if (!sched_shouldpreempt(pri, cpri, 0))
+ return;
+ ctd->td_owepreempt = 1;
+}
+
+/*
+ * Add a thread to a thread queue. Select the appropriate runq and add the
+ * thread to it. This is the internal function called when the tdq is
+ * predetermined.
+ */
+void
+tdq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+
+ if (td->td_priority < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = td->td_priority;
+ tdq_runq_add(tdq, td, flags);
+ tdq_load_add(tdq, td);
+}
+
+/*
+ * Select the target thread queue and add a thread to it. Request
+ * preemption or IPI a remote processor if required.
+ */
+void
+sched_add(struct thread *td, int flags)
+{
+ struct tdq *tdq;
+#ifdef SMP
+ int cpu;
+#endif
+
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Recalculate the priority before we select the target cpu or
+ * run-queue.
+ */
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_priority(td);
+#ifdef SMP
+ /*
+ * Pick the destination cpu and if it isn't ours transfer to the
+ * target cpu.
+ */
+ cpu = sched_pickcpu(td, flags);
+ tdq = sched_setcpu(td, cpu, flags);
+ tdq_add(tdq, td, flags);
+ if (cpu != PCPU_GET(cpuid)) {
+ tdq_notify(tdq, td);
+ return;
+ }
+#else
+ tdq = TDQ_SELF();
+ TDQ_LOCK(tdq);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ tdq_add(tdq, td, flags);
+#endif
+ if (!(flags & SRQ_YIELDING))
+ sched_setpreempt(td);
+}
+
+/*
+ * Remove a thread from a run-queue without running it. This is used
+ * when we're stealing a thread from a remote queue. Otherwise all threads
+ * exit by calling sched_exit_thread() and sched_throw() themselves.
+ */
+void
+sched_rem(struct thread *td)
+{
+ struct tdq *tdq;
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
+ "prio:%d", td->td_priority);
+ SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
+ tdq = TDQ_CPU(td->td_sched->ts_cpu);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
+ tdq_runq_rem(tdq, td);
+ tdq_load_rem(tdq, td);
+ TD_SET_CAN_RUN(td);
+ if (td->td_priority == tdq->tdq_lowpri)
+ tdq_setlowpri(tdq, NULL);
+}
+
+/*
+ * Fetch cpu utilization information. Updates on demand.
+ */
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+ fixpt_t pctcpu;
+ struct td_sched *ts;
+
+ pctcpu = 0;
+ ts = td->td_sched;
+ if (ts == NULL)
+ return (0);
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sched_pctcpu_update(ts, TD_IS_RUNNING(td));
+ if (ts->ts_ticks) {
+ int rtick;
+
+ /* How many rtick per second ? */
+ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
+ pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
+ }
+
+ return (pctcpu);
+}
+
+/*
+ * Enforce affinity settings for a thread. Called after adjustments to
+ * cpumask.
+ */
+void
+sched_affinity(struct thread *td)
+{
+#ifdef SMP
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu))
+ return;
+ if (TD_ON_RUNQ(td)) {
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ return;
+ }
+ if (!TD_IS_RUNNING(td))
+ return;
+ /*
+ * Force a switch before returning to userspace. If the
+ * target thread is not running locally send an ipi to force
+ * the issue.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+ if (td != curthread)
+ ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
+#endif
+}
+
+/*
+ * Bind a thread to a target cpu.
+ */
+void
+sched_bind(struct thread *td, int cpu)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+ KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
+ ts = td->td_sched;
+ if (ts->ts_flags & TSF_BOUND)
+ sched_unbind(td);
+ KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
+ ts->ts_flags |= TSF_BOUND;
+ sched_pin();
+ if (PCPU_GET(cpuid) == cpu)
+ return;
+ ts->ts_cpu = cpu;
+ /* When we return from mi_switch we'll be on the correct cpu. */
+ mi_switch(SW_VOL, NULL);
+}
+
+/*
+ * Release a bound thread.
+ */
+void
+sched_unbind(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
+ ts = td->td_sched;
+ if ((ts->ts_flags & TSF_BOUND) == 0)
+ return;
+ ts->ts_flags &= ~TSF_BOUND;
+ sched_unpin();
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_sched->ts_flags & TSF_BOUND);
+}
+
+/*
+ * Basic yield call.
+ */
+void
+sched_relinquish(struct thread *td)
+{
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+}
+
+/*
+ * Return the total system load.
+ */
+int
+sched_load(void)
+{
+#ifdef SMP
+ int total;
+ int i;
+
+ total = 0;
+ CPU_FOREACH(i)
+ total += TDQ_CPU(i)->tdq_sysload;
+ return (total);
+#else
+ return (TDQ_SELF()->tdq_sysload);
+#endif
+}
+
+int
+sched_sizeof_proc(void)
+{
+ return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+ return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+
+#ifdef SMP
+#define TDQ_IDLESPIN(tdq) \
+ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
+#else
+#define TDQ_IDLESPIN(tdq) 1
+#endif
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct thread *td;
+ struct tdq *tdq;
+ int oldswitchcnt, switchcnt;
+ int i;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+ td = curthread;
+ tdq = TDQ_SELF();
+ THREAD_NO_SLEEPING();
+ oldswitchcnt = -1;
+ for (;;) {
+ if (tdq->tdq_load) {
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ thread_unlock(td);
+ }
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+#ifdef SMP
+ if (switchcnt != oldswitchcnt) {
+ oldswitchcnt = switchcnt;
+ if (tdq_idled(tdq) == 0)
+ continue;
+ }
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+#else
+ oldswitchcnt = switchcnt;
+#endif
+ /*
+ * If we're switching very frequently, spin while checking
+ * for load rather than entering a low power state that
+ * may require an IPI. However, don't do any busy
+ * loops while on SMT machines as this simply steals
+ * cycles from cores doing useful work.
+ */
+ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
+ for (i = 0; i < sched_idlespins; i++) {
+ if (tdq->tdq_load)
+ break;
+ cpu_spinwait();
+ }
+ }
+
+ /* If there was context switch during spin, restart it. */
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+ if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
+ continue;
+
+ /* Run main MD idle handler. */
+ tdq->tdq_cpu_idle = 1;
+ cpu_idle(switchcnt * 4 > sched_idlespinthresh);
+ tdq->tdq_cpu_idle = 0;
+
+ /*
+ * Account for thread-less hardware interrupts and
+ * other wakeup reasons as if they were context switches.
+ */
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+ if (switchcnt != oldswitchcnt)
+ continue;
+ tdq->tdq_switchcnt++;
+ oldswitchcnt++;
+ }
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ struct thread *newtd;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ if (td == NULL) {
+ /* Correct spinlock nesting and acquire the correct lock. */
+ TDQ_LOCK(tdq);
+ spinlock_exit();
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ } else {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ tdq_load_rem(tdq, td);
+ lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
+ }
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ newtd = choosethread();
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ cpu_throw(td, newtd); /* doesn't return */
+}
+
+/*
+ * This is called from fork_exit(). Just acquire the correct locks and
+ * let fork do the rest of the work.
+ */
+void
+sched_fork_exit(struct thread *td)
+{
+ struct td_sched *ts;
+ struct tdq *tdq;
+ int cpuid;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with the scheduler lock held.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ if (TD_IS_IDLETHREAD(td))
+ td->td_lock = TDQ_LOCKPTR(tdq);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+ lock_profile_obtain_lock_success(
+ &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+}
+
+/*
+ * Create on first use to catch odd startup conditions.
+ */
+char *
+sched_tdname(struct thread *td)
+{
+#ifdef KTR
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ if (ts->ts_name[0] == '\0')
+ snprintf(ts->ts_name, sizeof(ts->ts_name),
+ "%s tid %d", td->td_name, td->td_tid);
+ return (ts->ts_name);
+#else
+ return (td->td_name);
+#endif
+}
+
+#ifdef KTR
+void
+sched_clear_tdname(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ ts->ts_name[0] = '\0';
+}
+#endif
+
+#ifdef SMP
+
+/*
+ * Build the CPU topology dump string. This function is called
+ * recursively to collect the topology tree.
+ */
+static int
+sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
+ int indent)
+{
+ char cpusetbuf[CPUSETBUFSIZ];
+ int i, first;
+
+ sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
+ "", 1 + indent / 2, cg->cg_level);
+ sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
+ cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
+ first = TRUE;
+ for (i = 0; i < MAXCPU; i++) {
+ if (CPU_ISSET(i, &cg->cg_mask)) {
+ if (!first)
+ sbuf_printf(sb, ", ");
+ else
+ first = FALSE;
+ sbuf_printf(sb, "%d", i);
+ }
+ }
+ sbuf_printf(sb, "</cpu>\n");
+
+ if (cg->cg_flags != 0) {
+ sbuf_printf(sb, "%*s <flags>", indent, "");
+ if ((cg->cg_flags & CG_FLAG_HTT) != 0)
+ sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
+ if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
+ sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
+ if ((cg->cg_flags & CG_FLAG_SMT) != 0)
+ sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
+ sbuf_printf(sb, "</flags>\n");
+ }
+
+ if (cg->cg_children > 0) {
+ sbuf_printf(sb, "%*s <children>\n", indent, "");
+ for (i = 0; i < cg->cg_children; i++)
+ sysctl_kern_sched_topology_spec_internal(sb,
+ &cg->cg_child[i], indent+2);
+ sbuf_printf(sb, "%*s </children>\n", indent, "");
+ }
+ sbuf_printf(sb, "%*s</group>\n", indent, "");
+ return (0);
+}
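+
+/*
+ * For reference, the sbuf built above looks roughly like the following
+ * (a hypothetical 4-CPU box with two dual-core packages; the actual
+ * levels, masks and flags depend on the detected topology, and the
+ * outer <groups> wrapper is added by the handler below):
+ *
+ *	<groups>
+ *	 <group level="1" cache-level="0">
+ *	  <cpu count="4" mask="f">0, 1, 2, 3</cpu>
+ *	  <children>
+ *	   <group level="2" cache-level="2">
+ *	    <cpu count="2" mask="3">0, 1</cpu>
+ *	   </group>
+ *	   <group level="2" cache-level="2">
+ *	    <cpu count="2" mask="c">2, 3</cpu>
+ *	   </group>
+ *	  </children>
+ *	 </group>
+ *	</groups>
+ */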
+
+/*
+ * Sysctl handler for retrieving the topology dump. It's a wrapper for
+ * the recursive sysctl_kern_sched_topology_spec_internal().
+ */
+static int
+sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *topo;
+ int err;
+
+ KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
+
+ topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
+ if (topo == NULL)
+ return (ENOMEM);
+
+ sbuf_printf(topo, "<groups>\n");
+ err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
+ sbuf_printf(topo, "</groups>\n");
+
+ if (err == 0) {
+ sbuf_finish(topo);
+ err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
+ }
+ sbuf_delete(topo);
+ return (err);
+}
+
+#endif
+
+static int
+sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
+{
+ int error, new_val, period;
+
+ period = 1000000 / realstathz;
+ new_val = period * sched_slice;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val <= 0)
+ return (EINVAL);
+ sched_slice = imax(1, (new_val + period / 2) / period);
+ sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+ return (0);
+}
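+
+/*
+ * Worked example of the conversion above (illustrative numbers): with
+ * realstathz = 128, one stathz tick is 1000000 / 128 = 7812 us. Reading
+ * the OID while sched_slice = 10 reports 10 * 7812 = 78120 us. Writing
+ * 20000 (us) back yields sched_slice = imax(1, (20000 + 3906) / 7812) = 3,
+ * i.e. the new quantum is rounded to the nearest whole stathz tick and
+ * never drops below one tick.
+ */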
+
+SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
+ "Scheduler name");
+SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, sysctl_kern_quantum, "I",
+ "Quantum for timeshare threads in microseconds");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+ "Quantum for timeshare threads in stathz ticks");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
+ "Interactivity score threshold");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
+ &preempt_thresh, 0,
+ "Maximal (lowest) priority for preemption");
+SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
+ "Assign static kernel priorities to sleeping threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
+ "Number of times idle thread will spin waiting for new work");
+SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
+ &sched_idlespinthresh, 0,
+ "Threshold before we will permit idle thread spinning");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
+ "Number of hz ticks to keep thread affinity for");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+ "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
+ &balance_interval, 0,
+ "Average period in stathz ticks to run the long-term balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
+ "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+ "Minimum load on remote CPU before we'll steal");
+SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
+ CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
+ "XML dump of detected CPU topology");
+#endif
+
+/* ps compat. All cpu percentages from ULE are weighted. */
+static int ccpu = 0;
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
diff --git a/sys/kern/serdev_if.m b/sys/kern/serdev_if.m
new file mode 100644
index 0000000..fbf4363
--- /dev/null
+++ b/sys/kern/serdev_if.m
@@ -0,0 +1,94 @@
+#-
+# Copyright (c) 2006 Marcel Moolenaar
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+#include <sys/serial.h>
+
+# The serdev interface is used by umbrella drivers and children thereof to
+# establish a more intimate relationship, necessary for efficient handling
+# of multiple (concurrent) serial communication channels. Examples include
+# serial communications controller (SCC) drivers, multi-I/O adapter drivers
+# and intelligent multi-port serial drivers. Methods specifically deal
+# with interrupt handling and configuration. Conceptually, the umbrella
+# driver is responsible for the overall operation of the hardware and uses
+# child drivers to handle each individual channel.
+# The serdev interface is intended to inherit the device interface.
+
+INTERFACE serdev;
+
+# Default implementations of some methods.
+CODE {
+ static serdev_intr_t *
+ default_ihand(device_t dev, int ipend)
+ {
+ return (NULL);
+ }
+
+ static int
+ default_ipend(device_t dev)
+ {
+ return (-1);
+ }
+
+ static int
+ default_sysdev(device_t dev)
+ {
+ return (0);
+ }
+};
+
+# ihand() - Query serial device interrupt handler.
+# This method is called by the umbrella driver to obtain function pointers
+# to interrupt handlers for each individual interrupt source. This allows
+# the umbrella driver to control the servicing of interrupts between the
+# different channels in the most flexible way.
+METHOD serdev_intr_t* ihand {
+ device_t dev;
+ int ipend;
+} DEFAULT default_ihand;
+
+# ipend() - Query pending interrupt status.
+# This method is called by the umbrella driver to obtain interrupt status
+# for the UART in question. This allows the umbrella driver to build a
+# matrix and service the interrupts in the most flexible way by calling
+# interrupt handlers collected with the ihand() method.
+METHOD int ipend {
+ device_t dev;
+} DEFAULT default_ipend;
+
+# sysdev() - Query system device status.
+# This method may be called by the umbrella driver for each child driver
+# to establish whether a particular channel and mode is currently being
+# used for system-specific purposes. If this is the case, the hardware
+# is not reset and the channel will not change its operation mode.
+# The return value is !0 if the channel and mode are used for a system
+# device and 0 otherwise.
+METHOD int sysdev {
+ device_t dev;
+} DEFAULT default_sysdev;
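+
+# Illustrative sketch (not part of this interface file): the interface
+# compiler turns the methods above into SERDEV_IHAND(), SERDEV_IPEND()
+# and SERDEV_SYSDEV() kobj calls. A hypothetical umbrella driver could
+# collect its children's handlers at attach time and poll for work from
+# its own interrupt routine, roughly like this (the names and the
+# meaning of the 'ipend' bits are assumptions, not dictated here):
+#
+#	/* attach: remember per-source handlers from the child. */
+#	for (src = 0; src < SC_NSRC; src++)
+#		sc->sc_ihand[src] = SERDEV_IHAND(child, 1 << src);
+#
+#	/* attach: leave system devices (e.g. consoles) untouched. */
+#	if (SERDEV_SYSDEV(child))
+#		skip_hardware_reset = 1;
+#
+#	/* interrupt: ask the child what is pending and dispatch. */
+#	ipend = SERDEV_IPEND(child);
+#	if (ipend > 0) {
+#		/* call the matching sc->sc_ihand[] entries */
+#	}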
+
diff --git a/sys/kern/stack_protector.c b/sys/kern/stack_protector.c
new file mode 100644
index 0000000..b5f9973
--- /dev/null
+++ b/sys/kern/stack_protector.c
@@ -0,0 +1,31 @@
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/libkern.h>
+
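+/*
+ * Minimal stack-smashing-protection (SSP) runtime. When the kernel is
+ * built with -fstack-protector, the compiler places a canary taken from
+ * __stack_chk_guard into each protected stack frame and verifies it on
+ * function return; on a mismatch it calls __stack_chk_fail(), which
+ * here simply panics. The SYSINIT below fills the guard with random
+ * bytes once the random subsystem is available.
+ */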
+long __stack_chk_guard[8] = {};
+void __stack_chk_fail(void);
+
+void
+__stack_chk_fail(void)
+{
+
+ panic("stack overflow detected; backtrace may be corrupted");
+}
+
+#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0]))
+static void
+__stack_chk_init(void *dummy __unused)
+{
+ size_t i;
+ long guard[__arraycount(__stack_chk_guard)];
+
+ arc4rand(guard, sizeof(guard), 0);
+ for (i = 0; i < __arraycount(guard); i++)
+ __stack_chk_guard[i] = guard[i];
+}
+SYSINIT(stack_chk, SI_SUB_RANDOM, SI_ORDER_ANY, __stack_chk_init, NULL);
diff --git a/sys/kern/subr_acl_nfs4.c b/sys/kern/subr_acl_nfs4.c
new file mode 100644
index 0000000..ef378a0
--- /dev/null
+++ b/sys/kern/subr_acl_nfs4.c
@@ -0,0 +1,1417 @@
+/*-
+ * Copyright (c) 2008-2010 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * ACL support routines specific to NFSv4 access control lists. These are
+ * utility routines for code common across file systems implementing NFSv4
+ * ACLs.
+ */
+
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/acl.h>
+#else
+#include <errno.h>
+#include <assert.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+#define KASSERT(a, b) assert(a)
+#define CTASSERT(a)
+
+#endif /* !_KERNEL */
+
+#ifdef _KERNEL
+
+static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode);
+
+static int acl_nfs4_old_semantics = 0;
+
+SYSCTL_INT(_vfs, OID_AUTO, acl_nfs4_old_semantics, CTLFLAG_RW,
+ &acl_nfs4_old_semantics, 0, "Use pre-PSARC/2010/029 NFSv4 ACL semantics");
+
+static struct {
+ accmode_t accmode;
+ int mask;
+} accmode2mask[] = {{VREAD, ACL_READ_DATA},
+ {VWRITE, ACL_WRITE_DATA},
+ {VAPPEND, ACL_APPEND_DATA},
+ {VEXEC, ACL_EXECUTE},
+ {VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+ {VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+ {VDELETE_CHILD, ACL_DELETE_CHILD},
+ {VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+ {VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+ {VDELETE, ACL_DELETE},
+ {VREAD_ACL, ACL_READ_ACL},
+ {VWRITE_ACL, ACL_WRITE_ACL},
+ {VWRITE_OWNER, ACL_WRITE_OWNER},
+ {VSYNCHRONIZE, ACL_SYNCHRONIZE},
+ {0, 0}};
+
+static int
+_access_mask_from_accmode(accmode_t accmode)
+{
+ int access_mask = 0, i;
+
+ for (i = 0; accmode2mask[i].accmode != 0; i++) {
+ if (accmode & accmode2mask[i].accmode)
+ access_mask |= accmode2mask[i].mask;
+ }
+
+ /*
+ * VAPPEND is just a modifier for VWRITE; if the caller asked
+ * for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only.
+ */
+ if (access_mask & ACL_APPEND_DATA)
+ access_mask &= ~ACL_WRITE_DATA;
+
+ return (access_mask);
+}
+
+/*
+ * Return 0 iff access is allowed, 1 otherwise.
+ */
+static int
+_acl_denies(const struct acl *aclp, int access_mask, struct ucred *cred,
+ int file_uid, int file_gid, int *denied_explicitly)
+{
+ int i;
+ const struct acl_entry *entry;
+
+ if (denied_explicitly != NULL)
+ *denied_explicitly = 0;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+ switch (entry->ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ continue;
+ break;
+ case ACL_USER:
+ if (entry->ae_id != cred->cr_uid)
+ continue;
+ break;
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ continue;
+ break;
+ case ACL_GROUP:
+ if (!groupmember(entry->ae_id, cred))
+ continue;
+ break;
+ default:
+ KASSERT(entry->ae_tag == ACL_EVERYONE,
+ ("entry->ae_tag == ACL_EVERYONE"));
+ }
+
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_DENY) {
+ if (entry->ae_perm & access_mask) {
+ if (denied_explicitly != NULL)
+ *denied_explicitly = 1;
+ return (1);
+ }
+ }
+
+ access_mask &= ~(entry->ae_perm);
+ if (access_mask == 0)
+ return (0);
+ }
+
+ if (access_mask == 0)
+ return (0);
+
+ return (1);
+}
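+
+/*
+ * Worked example for the loop above, with a hypothetical two-entry ACL
+ * "user:1001:deny:write_data" followed by
+ * "everyone@:allow:read_data/write_data": a request by uid 1001 for
+ * ACL_READ_DATA succeeds, because the DENY entry covers none of the
+ * requested bits and the everyone@ ALLOW entry clears the remaining
+ * mask; a request for ACL_WRITE_DATA hits the DENY entry first, so the
+ * function returns 1 with *denied_explicitly set.
+ */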
+
+int
+vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused)
+{
+ accmode_t priv_granted = 0;
+ int denied, explicitly_denied, access_mask, is_directory,
+ must_be_owner = 0;
+ mode_t file_mode = 0;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND |
+ VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS |
+ VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE |
+ VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ if (privused != NULL)
+ *privused = 0;
+
+ if (accmode & VADMIN)
+ must_be_owner = 1;
+
+ /*
+ * Ignore VSYNCHRONIZE permission.
+ */
+ accmode &= ~VSYNCHRONIZE;
+
+ access_mask = _access_mask_from_accmode(accmode);
+
+ if (type == VDIR)
+ is_directory = 1;
+ else
+ is_directory = 0;
+
+ /*
+ * The file owner is always allowed to read and write the ACL
+ * and basic attributes. This prevents a situation where the
+ * owner would change the ACL in a way that prevents them from
+ * undoing the change.
+ */
+ if (file_uid == cred->cr_uid)
+ access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL |
+ ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES);
+
+ /*
+ * Ignore append permission for regular files; use write
+ * permission instead.
+ */
+ if (!is_directory && (access_mask & ACL_APPEND_DATA)) {
+ access_mask &= ~ACL_APPEND_DATA;
+ access_mask |= ACL_WRITE_DATA;
+ }
+
+ denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid,
+ &explicitly_denied);
+
+ if (must_be_owner) {
+ if (file_uid != cred->cr_uid)
+ denied = EPERM;
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories. We have to check the mode here to stay
+ * consistent with execve(2). See the test in
+ * exec_check_permissions().
+ */
+ acl_nfs4_sync_mode_from_acl(&file_mode, aclp);
+ if (!denied && !is_directory && (accmode & VEXEC) &&
+ (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
+ denied = EACCES;
+
+ if (!denied)
+ return (0);
+
+ /*
+ * Access failed. If it was not denied explicitly and the
+ * VEXPLICIT_DENY flag was specified, allow access.
+ */
+ if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0)
+ return (0);
+
+ accmode &= ~VEXPLICIT_DENY;
+
+ /*
+ * No match. Try to use privileges, if there are any.
+ */
+ if (is_directory) {
+ if ((accmode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && (file_mode &
+ (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if ((accmode & (VWRITE | VAPPEND | VDELETE_CHILD)) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND | VDELETE_CHILD);
+
+ if ((accmode & VADMIN_PERMS) &&
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN_PERMS;
+
+ if ((accmode & VSTAT_PERMS) &&
+ !priv_check_cred(cred, PRIV_VFS_STAT, 0))
+ priv_granted |= VSTAT_PERMS;
+
+ if ((accmode & priv_granted) == accmode) {
+ if (privused != NULL)
+ *privused = 1;
+
+ return (0);
+ }
+
+ if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE))
+ denied = EPERM;
+ else
+ denied = EACCES;
+
+ return (denied);
+}
+#endif /* _KERNEL */
+
+static int
+_acl_entry_matches(struct acl_entry *entry, acl_tag_t tag, acl_perm_t perm,
+ acl_entry_type_t entry_type)
+{
+ if (entry->ae_tag != tag)
+ return (0);
+
+ if (entry->ae_id != ACL_UNDEFINED_ID)
+ return (0);
+
+ if (entry->ae_perm != perm)
+ return (0);
+
+ if (entry->ae_entry_type != entry_type)
+ return (0);
+
+ if (entry->ae_flags != 0)
+ return (0);
+
+ return (1);
+}
+
+static struct acl_entry *
+_acl_append(struct acl *aclp, acl_tag_t tag, acl_perm_t perm,
+ acl_entry_type_t entry_type)
+{
+ struct acl_entry *entry;
+
+ KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+
+ entry = &(aclp->acl_entry[aclp->acl_cnt]);
+ aclp->acl_cnt++;
+
+ entry->ae_tag = tag;
+ entry->ae_id = ACL_UNDEFINED_ID;
+ entry->ae_perm = perm;
+ entry->ae_entry_type = entry_type;
+ entry->ae_flags = 0;
+
+ return (entry);
+}
+
+static struct acl_entry *
+_acl_duplicate_entry(struct acl *aclp, int entry_index)
+{
+ int i;
+
+ KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+
+ for (i = aclp->acl_cnt; i > entry_index; i--)
+ aclp->acl_entry[i] = aclp->acl_entry[i - 1];
+
+ aclp->acl_cnt++;
+
+ return (&(aclp->acl_entry[entry_index + 1]));
+}
+
+static void
+acl_nfs4_sync_acl_from_mode_draft(struct acl *aclp, mode_t mode,
+ int file_owner_id)
+{
+ int i, meets, must_append;
+ struct acl_entry *entry, *copy, *previous,
+ *a1, *a2, *a3, *a4, *a5, *a6;
+ mode_t amode;
+ const int READ = 04;
+ const int WRITE = 02;
+ const int EXEC = 01;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.3. Applying a Mode to an Existing ACL
+ */
+
+ /*
+ * 1. For each ACE:
+ */
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ /*
+ * 1.1. If the type is neither ALLOW nor DENY - skip.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * 1.2. If ACL_ENTRY_INHERIT_ONLY is set - skip.
+ */
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+
+ /*
+ * 1.3. If ACL_ENTRY_FILE_INHERIT or ACL_ENTRY_DIRECTORY_INHERIT
+ * are set:
+ */
+ if (entry->ae_flags &
+ (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT)) {
+ /*
+ * 1.3.1. A copy of the current ACE is made, and placed
+ * in the ACL immediately following the current
+ * ACE.
+ */
+ copy = _acl_duplicate_entry(aclp, i);
+
+ /*
+ * 1.3.2. In the first ACE, the flag
+ * ACL_ENTRY_INHERIT_ONLY is set.
+ */
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * 1.3.3. In the second ACE, the following flags
+ * are cleared:
+ * ACL_ENTRY_FILE_INHERIT,
+ * ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_NO_PROPAGATE_INHERIT.
+ */
+ copy->ae_flags &= ~(ACL_ENTRY_FILE_INHERIT |
+ ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_NO_PROPAGATE_INHERIT);
+
+ /*
+ * The algorithm continues on with the second ACE.
+ */
+ i++;
+ entry = copy;
+ }
+
+ /*
+ * 1.4. If it's owner@, group@ or everyone@ entry, clear
+ * ACL_READ_DATA, ACL_WRITE_DATA, ACL_APPEND_DATA
+ * and ACL_EXECUTE. Continue to the next entry.
+ */
+ if (entry->ae_tag == ACL_USER_OBJ ||
+ entry->ae_tag == ACL_GROUP_OBJ ||
+ entry->ae_tag == ACL_EVERYONE) {
+ entry->ae_perm &= ~(ACL_READ_DATA | ACL_WRITE_DATA |
+ ACL_APPEND_DATA | ACL_EXECUTE);
+ continue;
+ }
+
+ /*
+ * 1.5. Otherwise, if the "who" field did not match one
+ * of OWNER@, GROUP@, EVERYONE@:
+ *
+ * 1.5.1. If the type is ALLOW, check the preceding ACE.
+ * If it does not meet all of the following criteria:
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW)
+ continue;
+
+ meets = 0;
+ if (i > 0) {
+ meets = 1;
+ previous = &(aclp->acl_entry[i - 1]);
+
+ /*
+ * 1.5.1.1. The type field is DENY,
+ */
+ if (previous->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ meets = 0;
+
+ /*
+ * 1.5.1.2. The "who" field is the same as the current
+ * ACE,
+ *
+ * 1.5.1.3. The flag bit ACE4_IDENTIFIER_GROUP
+ * is the same as it is in the current ACE,
+ * and no other flag bits are set,
+ */
+ if (previous->ae_id != entry->ae_id ||
+ previous->ae_tag != entry->ae_tag)
+ meets = 0;
+
+ if (previous->ae_flags)
+ meets = 0;
+
+ /*
+ * 1.5.1.4. The mask bits are a subset of the mask bits
+ * of the current ACE, and are also a subset of
+ * the following: ACL_READ_DATA,
+ * ACL_WRITE_DATA, ACL_APPEND_DATA, ACL_EXECUTE
+ */
+ if (previous->ae_perm & ~(entry->ae_perm))
+ meets = 0;
+
+ if (previous->ae_perm & ~(ACL_READ_DATA |
+ ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE))
+ meets = 0;
+ }
+
+ if (!meets) {
+ /*
+ * Then the ACE of type DENY, with a who equal
+ * to the current ACE, flag bits equal to
+ * (<current ACE flags> & <ACE_IDENTIFIER_GROUP>)
+ * and no mask bits, is prepended.
+ */
+ previous = entry;
+ entry = _acl_duplicate_entry(aclp, i);
+
+ /* Adjust counter, as we've just added an entry. */
+ i++;
+
+ previous->ae_tag = entry->ae_tag;
+ previous->ae_id = entry->ae_id;
+ previous->ae_flags = entry->ae_flags;
+ previous->ae_perm = 0;
+ previous->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+ }
+
+ /*
+ * 1.5.2. The following modifications are made to the prepended
+ * ACE. The intent is to mask the following ACE
+ * to disallow ACL_READ_DATA, ACL_WRITE_DATA,
+ * ACL_APPEND_DATA, or ACL_EXECUTE, based upon the group
+ * permissions of the new mode. As a special case,
+ * if the ACE matches the current owner of the file,
+ * the owner bits are used, rather than the group bits.
+ * This is reflected in the algorithm below.
+ */
+ amode = mode >> 3;
+
+ /*
+ * If ACE4_IDENTIFIER_GROUP is not set, and the "who" field
+ * in ACE matches the owner of the file, we shift amode three
+ * more bits, in order to have the owner permission bits
+ * placed in the three low order bits of amode.
+ */
+ if (entry->ae_tag == ACL_USER && entry->ae_id == file_owner_id)
+ amode = amode >> 3;
+
+ if (entry->ae_perm & ACL_READ_DATA) {
+ if (amode & READ)
+ previous->ae_perm &= ~ACL_READ_DATA;
+ else
+ previous->ae_perm |= ACL_READ_DATA;
+ }
+
+ if (entry->ae_perm & ACL_WRITE_DATA) {
+ if (amode & WRITE)
+ previous->ae_perm &= ~ACL_WRITE_DATA;
+ else
+ previous->ae_perm |= ACL_WRITE_DATA;
+ }
+
+ if (entry->ae_perm & ACL_APPEND_DATA) {
+ if (amode & WRITE)
+ previous->ae_perm &= ~ACL_APPEND_DATA;
+ else
+ previous->ae_perm |= ACL_APPEND_DATA;
+ }
+
+ if (entry->ae_perm & ACL_EXECUTE) {
+ if (amode & EXEC)
+ previous->ae_perm &= ~ACL_EXECUTE;
+ else
+ previous->ae_perm |= ACL_EXECUTE;
+ }
+
+ /*
+ * 1.5.3. If ACE4_IDENTIFIER_GROUP is set in the flags
+ * of the ALLOW ace:
+ *
+ * XXX: This point is not in Falkner's draft.
+ */
+ if (entry->ae_tag == ACL_GROUP &&
+ entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) {
+ mode_t extramode, ownermode;
+ extramode = (mode >> 3) & 07;
+ ownermode = mode >> 6;
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & READ) {
+ entry->ae_perm &= ~ACL_READ_DATA;
+ previous->ae_perm &= ~ACL_READ_DATA;
+ }
+
+ if (extramode & WRITE) {
+ entry->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ previous->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ }
+
+ if (extramode & EXEC) {
+ entry->ae_perm &= ~ACL_EXECUTE;
+ previous->ae_perm &= ~ACL_EXECUTE;
+ }
+ }
+ }
+ }
+
+ /*
+ * 2. If there are at least six ACEs, the final six ACEs are examined.
+ * If they are not equal to what we want, append six ACEs.
+ */
+ must_append = 0;
+ if (aclp->acl_cnt < 6) {
+ must_append = 1;
+ } else {
+ a6 = &(aclp->acl_entry[aclp->acl_cnt - 1]);
+ a5 = &(aclp->acl_entry[aclp->acl_cnt - 2]);
+ a4 = &(aclp->acl_entry[aclp->acl_cnt - 3]);
+ a3 = &(aclp->acl_entry[aclp->acl_cnt - 4]);
+ a2 = &(aclp->acl_entry[aclp->acl_cnt - 5]);
+ a1 = &(aclp->acl_entry[aclp->acl_cnt - 6]);
+
+ if (!_acl_entry_matches(a1, ACL_USER_OBJ, 0,
+ ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a2, ACL_USER_OBJ, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ if (!_acl_entry_matches(a3, ACL_GROUP_OBJ, 0,
+ ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a4, ACL_GROUP_OBJ, 0,
+ ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ if (!_acl_entry_matches(a5, ACL_EVERYONE, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a6, ACL_EVERYONE, ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS |
+ ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ }
+
+ if (must_append) {
+ KASSERT(aclp->acl_cnt + 6 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ a1 = _acl_append(aclp, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY);
+ a2 = _acl_append(aclp, ACL_USER_OBJ, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW);
+ a3 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY);
+ a4 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW);
+ a5 = _acl_append(aclp, ACL_EVERYONE, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY);
+ a6 = _acl_append(aclp, ACL_EVERYONE, ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS |
+ ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW);
+
+ KASSERT(a1 != NULL && a2 != NULL && a3 != NULL && a4 != NULL &&
+ a5 != NULL && a6 != NULL, ("couldn't append to ACL."));
+ }
+
+ /*
+ * 3. The final six ACEs are adjusted according to the incoming mode.
+ */
+ if (mode & S_IRUSR)
+ a2->ae_perm |= ACL_READ_DATA;
+ else
+ a1->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWUSR)
+ a2->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a1->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXUSR)
+ a2->ae_perm |= ACL_EXECUTE;
+ else
+ a1->ae_perm |= ACL_EXECUTE;
+
+ if (mode & S_IRGRP)
+ a4->ae_perm |= ACL_READ_DATA;
+ else
+ a3->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWGRP)
+ a4->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a3->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXGRP)
+ a4->ae_perm |= ACL_EXECUTE;
+ else
+ a3->ae_perm |= ACL_EXECUTE;
+
+ if (mode & S_IROTH)
+ a6->ae_perm |= ACL_READ_DATA;
+ else
+ a5->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWOTH)
+ a6->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a5->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXOTH)
+ a6->ae_perm |= ACL_EXECUTE;
+ else
+ a5->ae_perm |= ACL_EXECUTE;
+}
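+
+/*
+ * Worked example for steps 2 and 3 (starting from an empty ACL, so the
+ * six baseline entries are appended first): for mode 0644 the final six
+ * ACEs come out as
+ *
+ *	owner@:deny:execute
+ *	owner@:allow:read_data/write_data/append_data/write_acl/
+ *	    write_owner/write_attributes/write_named_attrs
+ *	group@:deny:write_data/append_data/execute
+ *	group@:allow:read_data
+ *	everyone@:deny:write_data/append_data/execute/write_acl/
+ *	    write_owner/write_attributes/write_named_attrs
+ *	everyone@:allow:read_data/read_acl/read_attributes/
+ *	    read_named_attrs/synchronize
+ *
+ * which is the "canonical six" encoding of rw-r--r--.
+ */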
+
+#ifdef _KERNEL
+void
+acl_nfs4_sync_acl_from_mode(struct acl *aclp, mode_t mode,
+ int file_owner_id)
+{
+
+ if (acl_nfs4_old_semantics)
+ acl_nfs4_sync_acl_from_mode_draft(aclp, mode, file_owner_id);
+ else
+ acl_nfs4_trivial_from_mode(aclp, mode);
+}
+#endif /* _KERNEL */
+
+void
+acl_nfs4_sync_mode_from_acl(mode_t *_mode, const struct acl *aclp)
+{
+ int i;
+ mode_t old_mode = *_mode, mode = 0, seen = 0;
+ const struct acl_entry *entry;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.1. Recomputing mode upon SETATTR of ACL
+ */
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+
+ if (entry->ae_tag == ACL_USER_OBJ) {
+ if ((entry->ae_perm & ACL_READ_DATA) &&
+ ((seen & S_IRUSR) == 0)) {
+ seen |= S_IRUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRUSR;
+ }
+ if ((entry->ae_perm & ACL_WRITE_DATA) &&
+ ((seen & S_IWUSR) == 0)) {
+ seen |= S_IWUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWUSR;
+ }
+ if ((entry->ae_perm & ACL_EXECUTE) &&
+ ((seen & S_IXUSR) == 0)) {
+ seen |= S_IXUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXUSR;
+ }
+ } else if (entry->ae_tag == ACL_GROUP_OBJ) {
+ if ((entry->ae_perm & ACL_READ_DATA) &&
+ ((seen & S_IRGRP) == 0)) {
+ seen |= S_IRGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRGRP;
+ }
+ if ((entry->ae_perm & ACL_WRITE_DATA) &&
+ ((seen & S_IWGRP) == 0)) {
+ seen |= S_IWGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWGRP;
+ }
+ if ((entry->ae_perm & ACL_EXECUTE) &&
+ ((seen & S_IXGRP) == 0)) {
+ seen |= S_IXGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXGRP;
+ }
+ } else if (entry->ae_tag == ACL_EVERYONE) {
+ if (entry->ae_perm & ACL_READ_DATA) {
+ if ((seen & S_IRUSR) == 0) {
+ seen |= S_IRUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRUSR;
+ }
+ if ((seen & S_IRGRP) == 0) {
+ seen |= S_IRGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRGRP;
+ }
+ if ((seen & S_IROTH) == 0) {
+ seen |= S_IROTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IROTH;
+ }
+ }
+ if (entry->ae_perm & ACL_WRITE_DATA) {
+ if ((seen & S_IWUSR) == 0) {
+ seen |= S_IWUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWUSR;
+ }
+ if ((seen & S_IWGRP) == 0) {
+ seen |= S_IWGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWGRP;
+ }
+ if ((seen & S_IWOTH) == 0) {
+ seen |= S_IWOTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWOTH;
+ }
+ }
+ if (entry->ae_perm & ACL_EXECUTE) {
+ if ((seen & S_IXUSR) == 0) {
+ seen |= S_IXUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXUSR;
+ }
+ if ((seen & S_IXGRP) == 0) {
+ seen |= S_IXGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXGRP;
+ }
+ if ((seen & S_IXOTH) == 0) {
+ seen |= S_IXOTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ }
+
+ *_mode = mode | (old_mode & ACL_PRESERVE_MASK);
+}
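+
+/*
+ * Worked example for the loop above, with a hypothetical two-entry ACL
+ * "owner@:deny:write_data" followed by
+ * "everyone@:allow:read_data/write_data": the owner write bit is decided
+ * by the first entry that mentions it, so S_IWUSR stays clear; the
+ * everyone@ entry then sets S_IRUSR, S_IRGRP, S_IROTH, S_IWGRP and
+ * S_IWOTH, giving 0466 (plus whatever non-rwx bits ACL_PRESERVE_MASK
+ * keeps from the old mode).
+ */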
+
+#ifdef _KERNEL
+/*
+ * Calculate inherited ACL in a manner compatible with NFSv4 Minor Version 1,
+ * draft-ietf-nfsv4-minorversion1-03.txt.
+ */
+static void
+acl_nfs4_compute_inherited_acl_draft(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+ int i, flags;
+ const struct acl_entry *parent_entry;
+ struct acl_entry *entry, *copy;
+
+ KASSERT(child_aclp->acl_cnt == 0, ("child_aclp->acl_cnt == 0"));
+ KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.2. Applying the mode given to CREATE or OPEN
+ * to an inherited ACL
+ */
+
+ /*
+ * 1. Form an ACL that is the concatenation of all inheritable ACEs.
+ */
+ for (i = 0; i < parent_aclp->acl_cnt; i++) {
+ parent_entry = &(parent_aclp->acl_entry[i]);
+ flags = parent_entry->ae_flags;
+
+ /*
+ * Entry is not inheritable at all.
+ */
+ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_FILE_INHERIT)) == 0)
+ continue;
+
+ /*
+ * We're creating a file, but entry is not inheritable
+ * by files.
+ */
+ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0)
+ continue;
+
+ /*
+ * Entry is inheritable only by files, but has NO_PROPAGATE
+ * flag set, and we're creating a directory, so it wouldn't
+ * propagate to any file in that directory anyway.
+ */
+ if (is_directory &&
+ (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 &&
+ (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT))
+ continue;
+
+ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+ child_aclp->acl_entry[child_aclp->acl_cnt] = *parent_entry;
+ child_aclp->acl_cnt++;
+ }
+
+ /*
+ * 2. For each entry in the new ACL, adjust its flags, possibly
+ * creating two entries in place of one.
+ */
+ for (i = 0; i < child_aclp->acl_cnt; i++) {
+ entry = &(child_aclp->acl_entry[i]);
+
+ /*
+ * This is not in the specification, but SunOS
+ * apparently does that.
+ */
+ if (((entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT) ||
+ !is_directory) &&
+ entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER);
+
+ /*
+ * 2.A. If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if the object
+ * being created is not a directory, then clear the
+ * following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT,
+ * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_INHERIT_ONLY.
+ */
+ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT ||
+ !is_directory) {
+ entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+
+ /*
+ * Continue on to the next ACE.
+ */
+ continue;
+ }
+
+ /*
+ * 2.B. If the object is a directory and ACL_ENTRY_FILE_INHERIT
+ * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure
+ * that ACL_ENTRY_INHERIT_ONLY is set. Continue to the
+ * next ACE. Otherwise...
+ */
+ /*
+ * XXX: Read it again and make sure what the "otherwise"
+ * applies to.
+ */
+ if (is_directory &&
+ (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) &&
+ ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) {
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+ continue;
+ }
+
+ /*
+ * 2.C. If the type of the ACE is neither ALLOW nor DENY,
+ * then continue.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * 2.D. Copy the original ACE into a second, adjacent ACE.
+ */
+ copy = _acl_duplicate_entry(child_aclp, i);
+
+ /*
+ * 2.E. On the first ACE, ensure that ACL_ENTRY_INHERIT_ONLY
+ * is set.
+ */
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * 2.F. On the second ACE, clear the following flags:
+ * ACL_ENTRY_NO_PROPAGATE_INHERIT, ACL_ENTRY_FILE_INHERIT,
+ * ACL_ENTRY_DIRECTORY_INHERIT, ACL_ENTRY_INHERIT_ONLY.
+ */
+ copy->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+
+ /*
+ * 2.G. On the second ACE, if the type is ALLOW,
+ * an implementation MAY clear the following
+ * mask bits: ACL_WRITE_ACL, ACL_WRITE_OWNER.
+ */
+ if (copy->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ copy->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER);
+
+ /*
+ * Increment the counter to skip the copied entry.
+ */
+ i++;
+ }
+
+ /*
+ * 3. To ensure that the mode is honored, apply the algorithm described
+ * in Section 3.16.6.3, using the mode that is to be used for file
+ * creation.
+ */
+ acl_nfs4_sync_acl_from_mode(child_aclp, mode, file_owner_id);
+}
+#endif /* _KERNEL */
+
+/*
+ * Populate the ACL with entries inherited from parent_aclp.
+ */
+static void
+acl_nfs4_inherit_entries(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+ int i, flags, tag;
+ const struct acl_entry *parent_entry;
+ struct acl_entry *entry;
+
+ KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ for (i = 0; i < parent_aclp->acl_cnt; i++) {
+ parent_entry = &(parent_aclp->acl_entry[i]);
+ flags = parent_entry->ae_flags;
+ tag = parent_entry->ae_tag;
+
+ /*
+ * Don't inherit owner@, group@, or everyone@ entries.
+ */
+ if (tag == ACL_USER_OBJ || tag == ACL_GROUP_OBJ ||
+ tag == ACL_EVERYONE)
+ continue;
+
+ /*
+ * Entry is not inheritable at all.
+ */
+ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_FILE_INHERIT)) == 0)
+ continue;
+
+ /*
+ * We're creating a file, but entry is not inheritable
+ * by files.
+ */
+ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0)
+ continue;
+
+ /*
+ * Entry is inheritable only by files, but has NO_PROPAGATE
+ * flag set, and we're creating a directory, so it wouldn't
+ * propagate to any file in that directory anyway.
+ */
+ if (is_directory &&
+ (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 &&
+ (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT))
+ continue;
+
+ /*
+ * Entry qualifies for being inherited.
+ */
+ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+ entry = &(child_aclp->acl_entry[child_aclp->acl_cnt]);
+ *entry = *parent_entry;
+ child_aclp->acl_cnt++;
+
+ entry->ae_flags &= ~ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * If the type of the ACE is neither ALLOW nor DENY,
+ * then leave it as it is and proceed to the next one.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if
+ * the object being created is not a directory, then clear
+ * the following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT,
+ * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_INHERIT_ONLY.
+ */
+ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT ||
+ !is_directory) {
+ entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+ }
+
+ /*
+ * If the object is a directory and ACL_ENTRY_FILE_INHERIT
+ * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure
+ * that ACL_ENTRY_INHERIT_ONLY is set.
+ */
+ if (is_directory &&
+ (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) &&
+ ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) {
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+ }
+
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW &&
+ (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) == 0) {
+ /*
+ * Some permissions must never be inherited.
+ */
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_NAMED_ATTRS | ACL_WRITE_ATTRIBUTES);
+
+ /*
+ * Others must be masked according to the file mode.
+ */
+ if ((mode & S_IRGRP) == 0)
+ entry->ae_perm &= ~ACL_READ_DATA;
+ if ((mode & S_IWGRP) == 0)
+ entry->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if ((mode & S_IXGRP) == 0)
+ entry->ae_perm &= ~ACL_EXECUTE;
+ }
+ }
+}
+
+/*
+ * Calculate inherited ACL in a manner compatible with PSARC/2010/029.
+ * It is also used to calculate a trivial ACL, by inheriting from
+ * a NULL ACL.
+ */
+static void
+acl_nfs4_compute_inherited_acl_psarc(const struct acl *parent_aclp,
+ struct acl *aclp, mode_t mode, int file_owner_id, int is_directory)
+{
+ acl_perm_t user_allow_first = 0, user_deny = 0, group_deny = 0;
+ acl_perm_t user_allow, group_allow, everyone_allow;
+
+ KASSERT(aclp->acl_cnt == 0, ("aclp->acl_cnt == 0"));
+
+ user_allow = group_allow = everyone_allow = ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE;
+ user_allow |= ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS;
+
+ if (mode & S_IRUSR)
+ user_allow |= ACL_READ_DATA;
+ if (mode & S_IWUSR)
+ user_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXUSR)
+ user_allow |= ACL_EXECUTE;
+
+ if (mode & S_IRGRP)
+ group_allow |= ACL_READ_DATA;
+ if (mode & S_IWGRP)
+ group_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXGRP)
+ group_allow |= ACL_EXECUTE;
+
+ if (mode & S_IROTH)
+ everyone_allow |= ACL_READ_DATA;
+ if (mode & S_IWOTH)
+ everyone_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXOTH)
+ everyone_allow |= ACL_EXECUTE;
+
+ user_deny = ((group_allow | everyone_allow) & ~user_allow);
+ group_deny = everyone_allow & ~group_allow;
+ user_allow_first = group_deny & ~user_deny;
+
+ if (user_allow_first != 0)
+ _acl_append(aclp, ACL_USER_OBJ, user_allow_first,
+ ACL_ENTRY_TYPE_ALLOW);
+ if (user_deny != 0)
+ _acl_append(aclp, ACL_USER_OBJ, user_deny,
+ ACL_ENTRY_TYPE_DENY);
+ if (group_deny != 0)
+ _acl_append(aclp, ACL_GROUP_OBJ, group_deny,
+ ACL_ENTRY_TYPE_DENY);
+
+ if (parent_aclp != NULL)
+ acl_nfs4_inherit_entries(parent_aclp, aclp, mode,
+ file_owner_id, is_directory);
+
+ _acl_append(aclp, ACL_USER_OBJ, user_allow, ACL_ENTRY_TYPE_ALLOW);
+ _acl_append(aclp, ACL_GROUP_OBJ, group_allow, ACL_ENTRY_TYPE_ALLOW);
+ _acl_append(aclp, ACL_EVERYONE, everyone_allow, ACL_ENTRY_TYPE_ALLOW);
+}
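+
+/*
+ * Worked example (no parent ACL): for mode 0606 everyone@ is allowed
+ * bits that group@ is not, so the routine emits, in order,
+ *
+ *	owner@:allow:read_data/write_data/append_data	(user_allow_first)
+ *	group@:deny:read_data/write_data/append_data	(group_deny)
+ *	owner@:allow:<the full user_allow set>
+ *	group@:allow:read_acl/read_attributes/read_named_attrs/synchronize
+ *	everyone@:allow:read_data/write_data/append_data/read_acl/...
+ *
+ * The leading owner@ ALLOW is there so that an owner who also happens
+ * to be a member of the file's group is not caught by the group@ DENY
+ * before reaching the owner@ ALLOW near the end. For a mode like 0644
+ * all three deny masks are zero and only the final three ALLOW entries
+ * are produced.
+ */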
+
+#ifdef _KERNEL
+void
+acl_nfs4_compute_inherited_acl(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+
+ if (acl_nfs4_old_semantics)
+ acl_nfs4_compute_inherited_acl_draft(parent_aclp, child_aclp,
+ mode, file_owner_id, is_directory);
+ else
+ acl_nfs4_compute_inherited_acl_psarc(parent_aclp, child_aclp,
+ mode, file_owner_id, is_directory);
+}
+#endif /* _KERNEL */
+
+/*
+ * Calculate trivial ACL in a manner compatible with PSARC/2010/029.
+ * Note that this results in an ACL different from (but semantically
+ * equal to) the "canonical six" trivial ACL computed using the algorithm
+ * described in draft-ietf-nfsv4-minorversion1-03.txt, 3.16.6.2.
+ */
+static void
+acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode)
+{
+
+ aclp->acl_cnt = 0;
+ acl_nfs4_compute_inherited_acl_psarc(NULL, aclp, mode, -1, -1);
+}
+
+#ifndef _KERNEL
+/*
+ * This routine is used by libc to implement acl_strip_np(3)
+ * and acl_is_trivial_np(3).
+ */
+void
+acl_nfs4_trivial_from_mode_libc(struct acl *aclp, int mode, int canonical_six)
+{
+
+ aclp->acl_cnt = 0;
+ if (canonical_six)
+ acl_nfs4_sync_acl_from_mode_draft(aclp, mode, -1);
+ else
+ acl_nfs4_trivial_from_mode(aclp, mode);
+}
+#endif /* !_KERNEL */
+
+#ifdef _KERNEL
+static int
+_acls_are_equal(const struct acl *a, const struct acl *b)
+{
+ int i;
+ const struct acl_entry *entrya, *entryb;
+
+ if (a->acl_cnt != b->acl_cnt)
+ return (0);
+
+ for (i = 0; i < b->acl_cnt; i++) {
+ entrya = &(a->acl_entry[i]);
+ entryb = &(b->acl_entry[i]);
+
+ if (entrya->ae_tag != entryb->ae_tag ||
+ entrya->ae_id != entryb->ae_id ||
+ entrya->ae_perm != entryb->ae_perm ||
+ entrya->ae_entry_type != entryb->ae_entry_type ||
+ entrya->ae_flags != entryb->ae_flags)
+ return (0);
+ }
+
+ return (1);
+}
+
+/*
+ * This routine is used to determine whether to remove the extended
+ * attribute that stores the ACL contents.
+ */
+int
+acl_nfs4_is_trivial(const struct acl *aclp, int file_owner_id)
+{
+ int trivial;
+ mode_t tmpmode = 0;
+ struct acl *tmpaclp;
+
+ if (aclp->acl_cnt > 6)
+ return (0);
+
+ /*
+ * Compute the mode from the ACL, then compute new ACL from that mode.
+ * If the ACLs are identical, then the ACL is trivial.
+ *
+ * XXX: I guess there is a faster way to do this. However, even
+ * this slow implementation significantly speeds things up
+ * for files that don't have non-trivial ACLs - it's critical
+ * for performance not to use EAs when they are not needed.
+ *
+ * First try the PSARC/2010/029 semantics.
+ */
+ tmpaclp = acl_alloc(M_WAITOK | M_ZERO);
+ acl_nfs4_sync_mode_from_acl(&tmpmode, aclp);
+ acl_nfs4_trivial_from_mode(tmpaclp, tmpmode);
+ trivial = _acls_are_equal(aclp, tmpaclp);
+ if (trivial) {
+ acl_free(tmpaclp);
+ return (trivial);
+ }
+
+ /*
+ * Check if it's a draft-ietf-nfsv4-minorversion1-03.txt trivial ACL.
+ */
+ tmpaclp->acl_cnt = 0;
+ acl_nfs4_sync_acl_from_mode_draft(tmpaclp, tmpmode, file_owner_id);
+ trivial = _acls_are_equal(aclp, tmpaclp);
+ acl_free(tmpaclp);
+
+ return (trivial);
+}
+#endif /* _KERNEL */
+
+int
+acl_nfs4_check(const struct acl *aclp, int is_directory)
+{
+ int i;
+ const struct acl_entry *entry;
+
+ /*
+ * The spec doesn't seem to say anything about ACL validity.
+ * It seems there is not much to do here. There is not even a need
+ * to count "owner@" or "everyone@" (ACL_USER_OBJ and ACL_EVERYONE)
+ * entries, as there can be several of them and that's perfectly
+ * valid. There can be none of them too. Really.
+ */
+
+ if (aclp->acl_cnt > ACL_MAX_ENTRIES || aclp->acl_cnt <= 0)
+ return (EINVAL);
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ switch (entry->ae_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_EVERYONE:
+ if (entry->ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ if (entry->ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ if ((entry->ae_perm | ACL_NFS4_PERM_BITS) != ACL_NFS4_PERM_BITS)
+ return (EINVAL);
+
+ /*
+ * Disallow ACL_ENTRY_TYPE_AUDIT and ACL_ENTRY_TYPE_ALARM for now.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ return (EINVAL);
+
+ if ((entry->ae_flags | ACL_FLAGS_BITS) != ACL_FLAGS_BITS)
+ return (EINVAL);
+
+ /* Disallow unimplemented flags. */
+ if (entry->ae_flags & (ACL_ENTRY_SUCCESSFUL_ACCESS |
+ ACL_ENTRY_FAILED_ACCESS))
+ return (EINVAL);
+
+ /* Disallow flags not allowed for ordinary files. */
+ if (!is_directory) {
+ if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT |
+ ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_INHERIT_ONLY))
+ return (EINVAL);
+ }
+ }
+
+ return (0);
+}
+
+#ifdef _KERNEL
+static int
+acl_nfs4_modload(module_t module, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ /* XXX TODO */
+ ret = 0;
+ break;
+
+ case MOD_UNLOAD:
+ /* XXX TODO */
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t acl_nfs4_mod = {
+ "acl_nfs4",
+ acl_nfs4_modload,
+ NULL
+};
+
+/*
+ * XXX TODO: which subsystem, order?
+ */
+DECLARE_MODULE(acl_nfs4, acl_nfs4_mod, SI_SUB_VFS, SI_ORDER_FIRST);
+MODULE_VERSION(acl_nfs4, 1);
+#endif /* _KERNEL */
diff --git a/sys/kern/subr_acl_posix1e.c b/sys/kern/subr_acl_posix1e.c
new file mode 100644
index 0000000..3200932
--- /dev/null
+++ b/sys/kern/subr_acl_posix1e.c
@@ -0,0 +1,691 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL support routines specific to POSIX.1e access control lists. These are
+ * utility routines for code common across file systems implementing POSIX.1e
+ * ACLs.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics;
+ * the access ACL has already been prepared for evaluation by the file system
+ * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
+ * errno value.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ accmode_t dac_granted;
+ accmode_t priv_granted;
+ accmode_t acl_mask_granted;
+ int group_matched, i;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt to
+ * use privileges granted via priv_granted. In some cases, which
+ * privileges to use may be ambiguous due to "best match", in which
+ * case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply until we've found a DAC
+ * entry that matches but has failed to allow access.
+ *
+ * XXXRW: Ideally, we'd determine the privileges required before
+ * asking for them.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ if ((accmode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && (acl_posix1e_acl_to_mode(acl) &
+ (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if (((accmode & VWRITE) || (accmode & VAPPEND)) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK and
+ * ACL_OTHER entries, preventing some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) ==
+ accmode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access ACL. If
+ * it doesn't, then generate a serious failure. For now, this means
+ * a debugging message and EPERM, but in the future it should probably
+ * be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
+ * masked by an ACL_MASK entry, if any. As such, first identify the
+ * ACL_MASK field, then iterate through identifying potential user
+ * matches, then group matches. If there is no ACL_MASK, assume that
+ * the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= (VWRITE | VAPPEND);
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
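+
+ /*
+ * Illustrative example: with a hypothetical access ACL containing
+ * "user:alice:rw-" and "mask::r--", a request by alice for VWRITE
+ * fails even though her ACL_USER entry grants write, because
+ * dac_granted is ANDed with acl_mask_granted (VREAD only) in the
+ * ACL_USER check below.
+ */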
+
+ /*
+ * Check ACL_USER ACL entries. There will either be one or no
+ * matches; if there is one, we accept or reject based on the
+ * match; otherwise, we continue on to groups.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) !=
+ accmode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a "best"
+ * match. Iterate across, testing each potential group match. Make
+ * sure we keep track of whether we found a match or not, so that we
+ * know if we should try again with any available privilege, or if we
+ * should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via pure
+ * DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted))
+ != accmode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted))
+ != accmode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) == accmode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((accmode & VADMIN) ? EPERM : EACCES);
+}
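+
+/*
+ * Worked example (illustrative): a credential requesting VREAD|VWRITE
+ * that matches an ACL_USER entry granting both read and write is still
+ * subject to the ACL_MASK computed above; if the mask lacks ACL_WRITE,
+ * dac_granted is reduced to VREAD and, absent the PRIV_VFS_WRITE
+ * privilege, the request fails with EACCES even though the matching
+ * entry itself grants write.
+ */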
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an inode
+ * with a mode_t field, this routine converts a mode_t entry to an
+ * acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ acl_entry.ae_entry_type = 0;
+ acl_entry.ae_flags = 0;
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
+
+/*
+ * Utility function to generate a file mode given a complete POSIX.1e access
+ * ACL. Note that if the ACL is improperly formed, this may result in a
+ * panic.
+ */
+mode_t
+acl_posix1e_acl_to_mode(struct acl *acl)
+{
+ struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
+ int i;
+
+ /*
+ * Find the ACL entries relevant to a POSIX permission mode.
+ */
+ acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl_user_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_group_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ break;
+
+ default:
+ panic("acl_posix1e_acl_to_mode: bad ae_tag");
+ }
+ }
+
+ if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
+ panic("acl_posix1e_acl_to_mode: missing base ae_tags");
+
+ /*
+ * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
+ * the mode "group" bits with its permissions. If there isn't, we
+ * use the ACL_GROUP_OBJ permissions.
+ */
+ if (acl_mask != NULL)
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
+ acl_other));
+ else
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
+ acl_other));
+}
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an implementing
+ * filesystem to determine if it should accept this and rely on the POSIX.1e
+ * ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ *
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ *
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ *
+ * Verify all ae_tag entries are understood by this implementation.
+ *
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
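+
+/*
+ * Example (illustrative): the smallest access ACL this check accepts
+ * mirrors a plain permission mode and has exactly three entries, e.g.
+ * for 0640:
+ *
+ *	ACL_USER_OBJ	ACL_UNDEFINED_ID	ACL_READ | ACL_WRITE
+ *	ACL_GROUP_OBJ	ACL_UNDEFINED_ID	ACL_READ
+ *	ACL_OTHER	ACL_UNDEFINED_ID	(no permissions)
+ *
+ * Adding any ACL_USER or ACL_GROUP entry requires that exactly one
+ * ACL_MASK entry be present as well.
+ */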
+
+/*
+ * Given a requested mode for a new object, and a default ACL, combine the
+ * two to produce a new mode. Be careful not to clear any bits that aren't
+ * intended to be affected by the POSIX.1e ACL. Eventually, this might also
+ * take the cmask as an argument, if we push that down into
+ * per-filesystem code.
+ */
+mode_t
+acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
+{
+ mode_t mode;
+
+ mode = cmode;
+ /*
+ * The current composition policy is that a permission bit must be
+ * set in *both* the ACL and the requested creation mode for it to
+ * appear in the resulting mode/ACL. First clear any possibly
+	 * affected bits, then reconstruct.
+ */
+ mode &= ACL_PRESERVE_MASK;
+ mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
+
+ return (mode);
+}
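+
+/*
+ * Worked example (illustrative, assuming ACL_OVERRIDE_MASK covers the
+ * 0777 permission bits): a requested creation mode of 0666 combined
+ * with a default ACL whose _OBJ/ACL_OTHER entries correspond to 0750
+ * yields 0666 & 0750 == 0640 for the permission bits; bits outside
+ * ACL_OVERRIDE_MASK are carried over from the requested mode unchanged.
+ */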
+
+
+static int
+acl_posix1e_modload(module_t mod, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ /* XXX TODO */
+ ret = 0;
+ break;
+
+ case MOD_UNLOAD:
+ /* XXX TODO */
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t acl_posix1e_mod = {
+ "acl_posix1e",
+ acl_posix1e_modload,
+ NULL
+};
+
+DECLARE_MODULE(acl_posix1e, acl_posix1e_mod, SI_SUB_VFS, SI_ORDER_FIRST);
+MODULE_VERSION(acl_posix1e, 1);
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
new file mode 100644
index 0000000..6384056
--- /dev/null
+++ b/sys/kern/subr_autoconf.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+/*
+ * Autoconfiguration subroutines.
+ */
+
+/*
+ * "Interrupt driven config" functions.
+ */
+static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
+ TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
+static struct intr_config_hook *next_to_notify;
+static struct mtx intr_config_hook_lock;
+MTX_SYSINIT(intr_config_hook, &intr_config_hook_lock, "intr config", MTX_DEF);
+
+/* ARGSUSED */
+static void run_interrupt_driven_config_hooks(void);
+
+/*
+ * If we wait too long for an interrupt-driven config hook to return, print
+ * a diagnostic.
+ */
+#define WARNING_INTERVAL_SECS 60
+static void
+run_interrupt_driven_config_hooks_warning(int warned)
+{
+ struct intr_config_hook *hook_entry;
+ char namebuf[64];
+ long offset;
+
+ if (warned < 6) {
+ printf("run_interrupt_driven_hooks: still waiting after %d "
+ "seconds for", warned * WARNING_INTERVAL_SECS);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) {
+ if (linker_search_symbol_name(
+ (caddr_t)hook_entry->ich_func, namebuf,
+ sizeof(namebuf), &offset) == 0)
+ printf(" %s", namebuf);
+ else
+ printf(" %p", hook_entry->ich_func);
+ }
+ printf("\n");
+ }
+ KASSERT(warned < 6,
+ ("run_interrupt_driven_config_hooks: waited too long"));
+}
+
+static void
+run_interrupt_driven_config_hooks()
+{
+ static int running;
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+
+ /*
+ * If hook processing is already active, any newly
+ * registered hooks will eventually be notified.
+ * Let the currently running session issue these
+ * notifications.
+ */
+ if (running != 0) {
+ mtx_unlock(&intr_config_hook_lock);
+ return;
+ }
+ running = 1;
+
+ while (next_to_notify != NULL) {
+ hook_entry = next_to_notify;
+ next_to_notify = TAILQ_NEXT(hook_entry, ich_links);
+ mtx_unlock(&intr_config_hook_lock);
+ (*hook_entry->ich_func)(hook_entry->ich_arg);
+ mtx_lock(&intr_config_hook_lock);
+ }
+
+ running = 0;
+ mtx_unlock(&intr_config_hook_lock);
+}
+
+static void
+boot_run_interrupt_driven_config_hooks(void *dummy)
+{
+ int warned;
+
+ run_interrupt_driven_config_hooks();
+
+ /* Block boot processing until all hooks are disestablished. */
+ mtx_lock(&intr_config_hook_lock);
+ warned = 0;
+ while (!TAILQ_EMPTY(&intr_config_hook_list)) {
+ if (msleep(&intr_config_hook_list, &intr_config_hook_lock,
+ 0, "conifhk", WARNING_INTERVAL_SECS * hz) ==
+ EWOULDBLOCK) {
+ mtx_unlock(&intr_config_hook_lock);
+ warned++;
+ run_interrupt_driven_config_hooks_warning(warned);
+ mtx_lock(&intr_config_hook_lock);
+ }
+ }
+ mtx_unlock(&intr_config_hook_lock);
+}
+
+SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
+ boot_run_interrupt_driven_config_hooks, NULL);
+
+/*
+ * Register a hook that will be called after "cold"
+ * autoconfiguration is complete and interrupts can
+ * be used to complete initialization.
+ */
+int
+config_intrhook_establish(struct intr_config_hook *hook)
+{
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry != NULL) {
+ mtx_unlock(&intr_config_hook_lock);
+ printf("config_intrhook_establish: establishing an "
+ "already established hook.\n");
+ return (1);
+ }
+ TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+ if (next_to_notify == NULL)
+ next_to_notify = hook;
+ mtx_unlock(&intr_config_hook_lock);
+ if (cold == 0)
+ /*
+ * XXX Call from a task since not all drivers expect
+ * to be re-entered at the time a hook is established.
+ */
+ /* XXX Sufficient for modules loaded after initial config??? */
+ run_interrupt_driven_config_hooks();
+ return (0);
+}
+
+void
+config_intrhook_disestablish(struct intr_config_hook *hook)
+{
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry == NULL)
+ panic("config_intrhook_disestablish: disestablishing an "
+ "unestablished hook");
+
+ if (next_to_notify == hook)
+ next_to_notify = TAILQ_NEXT(hook, ich_links);
+ TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+
+ /* Wakeup anyone watching the list */
+ wakeup(&intr_config_hook_list);
+ mtx_unlock(&intr_config_hook_lock);
+}
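+
+/*
+ * Usage sketch (illustrative; the foo_* driver names are hypothetical):
+ * a driver whose initialization needs working interrupts defers that
+ * work to a config hook and disestablishes the hook when finished,
+ * which unblocks the boot-time wait above.
+ *
+ *	static struct intr_config_hook foo_hook;
+ *
+ *	static void
+ *	foo_delayed_attach(void *arg)
+ *	{
+ *		struct foo_softc *sc = arg;
+ *
+ *		foo_finish_init(sc);
+ *		config_intrhook_disestablish(&foo_hook);
+ *	}
+ *
+ *	In foo_attach():
+ *		foo_hook.ich_func = foo_delayed_attach;
+ *		foo_hook.ich_arg = sc;
+ *		if (config_intrhook_establish(&foo_hook) != 0)
+ *			return (ENXIO);
+ */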
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(conifhk, db_show_conifhk)
+{
+ struct intr_config_hook *hook_entry;
+ char namebuf[64];
+ long offset;
+
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) {
+ if (linker_ddb_search_symbol_name(
+ (caddr_t)hook_entry->ich_func, namebuf, sizeof(namebuf),
+ &offset) == 0) {
+ db_printf("hook: %p at %s+%#lx arg: %p\n",
+ hook_entry->ich_func, namebuf, offset,
+ hook_entry->ich_arg);
+ } else {
+ db_printf("hook: %p at ??+?? arg %p\n",
+ hook_entry->ich_func, hook_entry->ich_arg);
+ }
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
new file mode 100644
index 0000000..5c45b81
--- /dev/null
+++ b/sys/kern/subr_blist.c
@@ -0,0 +1,1095 @@
+/*-
+ * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
+ *
+ * This module implements a general bitmap allocator/deallocator. The
+ * allocator eats around 2 bits per 'block'. The module does not
+ * try to interpret the meaning of a 'block' other than to return
+ * SWAPBLK_NONE on an allocation failure.
+ *
+ * A radix tree is used to maintain the bitmap. Two radix constants are
+ * involved: One for the bitmaps contained in the leaf nodes (typically
+ * 32), and one for the meta nodes (typically 16). Both meta and leaf
+ * nodes have a hint field. This field gives us a hint as to the largest
+ * free contiguous range of blocks under the node. It may contain a
+ * value that is too high, but will never contain a value that is too
+ * low. When the radix tree is searched, allocation failures in subtrees
+ * update the hint.
+ *
+ * The radix tree also implements two collapsed states for meta nodes:
+ * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
+ * in either of these two states, all information contained underneath
+ * the node is considered stale. These states are used to optimize
+ * allocation and freeing operations.
+ *
+ * The hinting greatly increases code efficiency for allocations while
+ * the general radix structure optimizes both allocations and frees. The
+ * radix tree should be able to operate well no matter how much
+ * fragmentation there is and no matter how large a bitmap is used.
+ *
+ * The blist code wires all necessary memory at creation time. Neither
+ * allocations nor frees require interaction with the memory subsystem.
+ * The non-blocking features of the blist code are used in the swap code
+ * (vm/swap_pager.c).
+ *
+ * LAYOUT: The radix tree is laid out recursively using a
+ * linear array.  Each meta node is immediately followed (laid out
+ * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
+ * is a recursive structure but one that can be easily scanned through
+ * a very simple 'skip' calculation. In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation. We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan. The memory allocation is only large enough
+ * to cover the number of blocks requested at creation time even if it
+ * must be encompassed in a larger root-node radix.
+ *
+ * NOTE: the allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
+ * large' if you try. This is an area that could use improvement. The
+ * radix is large enough that this restriction does not affect the swap
+ * system, though.  Currently only the allocation code is affected by
+ * this algorithmic unfeature. The freeing code can handle arbitrary
+ * ranges.
+ *
+ * This code can be compiled stand-alone for debugging.
+ */
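+
+/*
+ * Worked example (illustrative, using the typical radix values noted
+ * above, leaf 32 and meta 16): for a blist covering 10000 blocks,
+ * blist_create()'s sizing loop steps radix through 32, 512, 8192,
+ * 131072 and skip through 0, 16, 272, 4368, stopping at the first
+ * radix >= 10000.  The root therefore spans a 131072-block radix even
+ * though only enough memory to describe 10000 blocks is wired; the
+ * unused portions are marked with bighint == -1 terminators by
+ * blst_radix_init().
+ */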
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/blist.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+
+#else
+
+#ifndef BLIST_NO_DEBUG
+#define BLIST_DEBUG
+#endif
+
+#define SWAPBLK_NONE ((daddr_t)-1)
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#define malloc(a,b,c) calloc(a, 1)
+#define free(a,b) free(a)
+
+typedef unsigned int u_daddr_t;
+
+#include <sys/blist.h>
+
+void panic(const char *ctl, ...);
+
+#endif
+
+/*
+ * static support functions
+ */
+
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
+ daddr_t count, daddr_t radix, int skip);
+static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, blist_t dest, daddr_t count);
+static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
+ int skip, daddr_t count);
+#ifndef _KERNEL
+static void blst_radix_print(blmeta_t *scan, daddr_t blk,
+ daddr_t radix, int skip, int tab);
+#endif
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
+#endif
+
+/*
+ * blist_create() - create a blist capable of handling up to the specified
+ * number of blocks
+ *
+ * blocks - must be greater than 0
+ * flags - malloc flags
+ *
+ * The smallest blist consists of a single leaf node capable of
+ * managing BLIST_BMAP_RADIX blocks.
+ */
+
+blist_t
+blist_create(daddr_t blocks, int flags)
+{
+ blist_t bl;
+ int radix;
+ int skip = 0;
+
+ /*
+ * Calculate radix and skip field used for scanning.
+ */
+ radix = BLIST_BMAP_RADIX;
+
+ while (radix < blocks) {
+ radix *= BLIST_META_RADIX;
+ skip = (skip + 1) * BLIST_META_RADIX;
+ }
+
+ bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO);
+
+ bl->bl_blocks = blocks;
+ bl->bl_radix = radix;
+ bl->bl_skip = skip;
+ bl->bl_rootblks = 1 +
+ blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
+ bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags);
+
+#if defined(BLIST_DEBUG)
+ printf(
+ "BLIST representing %lld blocks (%lld MB of swap)"
+ ", requiring %lldK of ram\n",
+ (long long)bl->bl_blocks,
+ (long long)bl->bl_blocks * 4 / 1024,
+ (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ );
+ printf("BLIST raw radix tree contains %lld records\n",
+ (long long)bl->bl_rootblks);
+#endif
+ blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
+
+ return(bl);
+}
+
+void
+blist_destroy(blist_t bl)
+{
+ free(bl->bl_root, M_SWAP);
+ free(bl, M_SWAP);
+}
+
+/*
+ * blist_alloc() - reserve space in the block bitmap. Return the base
+ * of a contiguous region or SWAPBLK_NONE if space could
+ * not be allocated.
+ */
+
+daddr_t
+blist_alloc(blist_t bl, daddr_t count)
+{
+ daddr_t blk = SWAPBLK_NONE;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blk = blst_leaf_alloc(bl->bl_root, 0, count);
+ else
+ blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
+ if (blk != SWAPBLK_NONE)
+ bl->bl_free -= count;
+ }
+ return(blk);
+}
+
+/*
+ * blist_free() -	free up space in the block bitmap.  Panic if an
+ *			inconsistency is found.
+ */
+
+void
+blist_free(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blst_leaf_free(bl->bl_root, blkno, count);
+ else
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free += count;
+ }
+}
+
+/*
+ * blist_fill() - mark a region in the block bitmap as off-limits
+ * to the allocator (i.e. allocate it), ignoring any
+ * existing allocations. Return the number of blocks
+ * actually filled that were free before the call.
+ */
+
+int
+blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ int filled;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ filled = blst_leaf_fill(bl->bl_root, blkno, count);
+ else
+ filled = blst_meta_fill(bl->bl_root, blkno, count,
+ bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free -= filled;
+ return filled;
+ } else
+ return 0;
+}
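+
+/*
+ * Usage sketch (illustrative): a typical consumer creates a blist,
+ * frees the whole range (a fresh blist starts fully allocated), and
+ * then reserves and releases contiguous runs:
+ *
+ *	blist_t bl = blist_create(nblocks, M_WAITOK);
+ *	daddr_t blk;
+ *
+ *	blist_free(bl, 0, nblocks);
+ *	blk = blist_alloc(bl, 16);
+ *	if (blk != SWAPBLK_NONE)
+ *		blist_free(bl, blk, 16);
+ *	blist_destroy(bl);
+ */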
+
+/*
+ * blist_resize() - resize an existing radix tree to handle the
+ * specified number of blocks. This will reallocate
+ * the tree and transfer the previous bitmap to the new
+ * one. When extending the tree you can specify whether
+ *			the new blocks are to be left allocated or freed.
+ */
+
+void
+blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
+{
+ blist_t newbl = blist_create(count, flags);
+ blist_t save = *pbl;
+
+ *pbl = newbl;
+ if (count > save->bl_blocks)
+ count = save->bl_blocks;
+ blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+
+ /*
+ * If resizing upwards, should we free the new space or not?
+ */
+ if (freenew && count < newbl->bl_blocks) {
+ blist_free(newbl, count, newbl->bl_blocks - count);
+ }
+ blist_destroy(save);
+}
+
+#ifdef BLIST_DEBUG
+
+/*
+ * blist_print() - dump radix tree
+ */
+
+void
+blist_print(blist_t bl)
+{
+ printf("BLIST {\n");
+ blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+ printf("}\n");
+}
+
+#endif
+
+/************************************************************************
+ * ALLOCATION SUPPORT FUNCTIONS *
+ ************************************************************************
+ *
+ * These support functions do all the actual work. They may seem
+ * rather longish, but that's because I've commented them up. The
+ *	actual code is straightforward.
+ *
+ */
+
+/*
+ * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
+ *
+ * This is the core of the allocator and is optimized for the 1 block
+ * and the BLIST_BMAP_RADIX block allocation cases. Other cases are
+ * somewhat slower. The 1 block allocation case is log2 and extremely
+ * quick.
+ */
+
+static daddr_t
+blst_leaf_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ u_daddr_t orig = scan->u.bmu_bitmap;
+
+ if (orig == 0) {
+ /*
+ * Optimize bitmap all-allocated case. Also, count = 1
+ * case assumes at least 1 bit is free in the bitmap, so
+ * we have to take care of this case here.
+ */
+ scan->bm_bighint = 0;
+ return(SWAPBLK_NONE);
+ }
+ if (count == 1) {
+ /*
+ * Optimized code to allocate one bit out of the bitmap
+ */
+ u_daddr_t mask;
+ int j = BLIST_BMAP_RADIX/2;
+ int r = 0;
+
+ mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
+
+ while (j) {
+ if ((orig & mask) == 0) {
+ r += j;
+ orig >>= j;
+ }
+ j >>= 1;
+ mask >>= j;
+ }
+ scan->u.bmu_bitmap &= ~(1 << r);
+ return(blk + r);
+ }
+ if (count <= BLIST_BMAP_RADIX) {
+ /*
+ * non-optimized code to allocate N bits out of the bitmap.
+ * The more bits, the faster the code runs. It will run
+ * the slowest allocating 2 bits, but since there aren't any
+ * memory ops in the core loop (or shouldn't be, anyway),
+ * you probably won't notice the difference.
+ */
+ int j;
+ int n = BLIST_BMAP_RADIX - count;
+ u_daddr_t mask;
+
+ mask = (u_daddr_t)-1 >> n;
+
+ for (j = 0; j <= n; ++j) {
+ if ((orig & mask) == mask) {
+ scan->u.bmu_bitmap &= ~mask;
+ return(blk + j);
+ }
+ mask = (mask << 1);
+ }
+ }
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * blist_meta_alloc() - allocate at a meta in the radix tree.
+ *
+ * Attempt to allocate at a meta node. If we can't, we update
+ *	bighint and return a failure.  Updating bighint optimizes future
+ * calls that hit this node. We have to check for our collapse cases
+ * and we have a few optimizations strewn in as well.
+ */
+
+static daddr_t
+blst_meta_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t count,
+ daddr_t radix,
+ int skip
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ scan->bm_bighint = count;
+ return(SWAPBLK_NONE);
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix /= BLIST_META_RADIX;
+
+ /*
+		 * ALL-FREE special case, initialize the uninitialized
+ * sublevel.
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix /= BLIST_META_RADIX;
+ }
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count <= scan[i].bm_bighint) {
+ /*
+ * count fits in object
+ */
+ daddr_t r;
+ if (next_skip == 1) {
+ r = blst_leaf_alloc(&scan[i], blk, count);
+ } else {
+ r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
+ }
+ if (r != SWAPBLK_NONE) {
+ scan->u.bmu_avail -= count;
+ if (scan->bm_bighint > scan->u.bmu_avail)
+ scan->bm_bighint = scan->u.bmu_avail;
+ return(r);
+ }
+ } else if (scan[i].bm_bighint == (daddr_t)-1) {
+ /*
+ * Terminator
+ */
+ break;
+ } else if (count > radix) {
+ /*
+ * count does not fit in object even if it were
+			 * completely free.
+ */
+ panic("blist_meta_alloc: allocation too large");
+ }
+ blk += radix;
+ }
+
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ if (scan->bm_bighint >= count)
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * BLST_LEAF_FREE() - free allocated block from leaf bitmap
+ *
+ */
+
+static void
+blst_leaf_free(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ /*
+ * free some data in this bitmap
+ *
+ * e.g.
+ * 0000111111111110000
+ * \_________/\__/
+ * v n
+ */
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ u_daddr_t mask;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ if (scan->u.bmu_bitmap & mask)
+ panic("blst_radix_free: freeing free block");
+ scan->u.bmu_bitmap |= mask;
+
+ /*
+ * We could probably do a better job here. We are required to make
+ * bighint at least as large as the biggest contiguous block of
+ * data. If we just shoehorn it, a little extra overhead will
+	 * be incurred on the next allocation (but only that one typically).
+ */
+ scan->bm_bighint = BLIST_BMAP_RADIX;
+}
+
+/*
+ * BLST_META_FREE() - free allocated blocks from radix tree meta info
+ *
+ * This support routine frees a range of blocks from the bitmap.
+ * The range must be entirely enclosed by this radix node. If a
+ * meta node, we break the range down recursively to free blocks
+ * in subnodes (which means that this code can free an arbitrary
+ * range whereas the allocation code cannot allocate an arbitrary
+ * range).
+ */
+
+static void
+blst_meta_free(
+ blmeta_t *scan,
+ daddr_t freeBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+#if 0
+ printf("free (%llx,%lld) FROM (%llx,%lld)\n",
+ (long long)freeBlk, (long long)count,
+ (long long)blk, (long long)radix
+ );
+#endif
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case, with possible
+ * shortcut to ALL-FREE special case.
+ */
+ scan->u.bmu_avail = count;
+ scan->bm_bighint = count;
+
+ if (count != radix) {
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ scan[i].bm_bighint = 0;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = 0;
+ } else {
+ scan[i].u.bmu_avail = 0;
+ }
+ }
+ /* fall through */
+ }
+ } else {
+ scan->u.bmu_avail += count;
+ /* scan->bm_bighint = radix; */
+ }
+
+ /*
+ * ALL-FREE special case.
+ */
+
+ if (scan->u.bmu_avail == radix)
+ return;
+ if (scan->u.bmu_avail > radix)
+ panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
+ (long long)count, (long long)scan->u.bmu_avail,
+ (long long)radix);
+
+ /*
+ * Break the free down into its components
+ */
+
+ radix /= BLIST_META_RADIX;
+
+ i = (freeBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < freeBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - freeBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_free: freeing unexpected range");
+
+ if (next_skip == 1) {
+ blst_leaf_free(&scan[i], freeBlk, v);
+ } else {
+ blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
+ }
+ if (scan->bm_bighint < scan[i].bm_bighint)
+ scan->bm_bighint = scan[i].bm_bighint;
+ count -= v;
+ freeBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+}
+
+/*
+ * BLIST_RADIX_COPY() - copy one radix tree to another
+ *
+ * Locates free space in the source tree and frees it in the destination
+ * tree. The space may not already be free in the destination.
+ */
+
+static void blst_copy(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t radix,
+ daddr_t skip,
+ blist_t dest,
+ daddr_t count
+) {
+ int next_skip;
+ int i;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ u_daddr_t v = scan->u.bmu_bitmap;
+
+ if (v == (u_daddr_t)-1) {
+ blist_free(dest, blk, count);
+ } else if (v != 0) {
+ int i;
+
+ for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
+ if (v & (1 << i))
+ blist_free(dest, blk + i, 1);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Meta node
+ */
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * Source all allocated, leave dest allocated
+ */
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ /*
+ * Source all free, free entire dest
+ */
+ if (count < radix)
+ blist_free(dest, blk, count);
+ else
+ blist_free(dest, blk, radix);
+ return;
+ }
+
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ for (i = 1; count && i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+
+ if (count >= radix) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ radix
+ );
+ count -= radix;
+ } else {
+ if (count) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ count
+ );
+ }
+ count = 0;
+ }
+ blk += radix;
+ }
+}
+
+/*
+ * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
+ *
+ * This routine allocates all blocks in the specified range
+ * regardless of any existing allocations in that range. Returns
+ * the number of blocks allocated by the call.
+ */
+
+static int
+blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
+{
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ int nblks;
+ u_daddr_t mask, bitmap;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ /* Count the number of blocks we're about to allocate */
+ bitmap = scan->u.bmu_bitmap & mask;
+ for (nblks = 0; bitmap != 0; nblks++)
+ bitmap &= bitmap - 1;
+
+ scan->u.bmu_bitmap &= ~mask;
+ return nblks;
+}
+
+/*
+ * BLIST_META_FILL() - allocate specific blocks at a meta node
+ *
+ * This routine allocates the specified range of blocks,
+ * regardless of any existing allocations in the range. The
+ * range must be within the extent of this node. Returns the
+ * number of blocks allocated by the call.
+ */
+static int
+blst_meta_fill(
+ blmeta_t *scan,
+ daddr_t allocBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+ int nblks = 0;
+
+ if (count == radix || scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ nblks = scan->u.bmu_avail;
+ scan->u.bmu_avail = 0;
+ scan->bm_bighint = count;
+ return nblks;
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix /= BLIST_META_RADIX;
+
+ /*
+ * ALL-FREE special case, initialize sublevel
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix /= BLIST_META_RADIX;
+ }
+
+ if (count > radix)
+ panic("blist_meta_fill: allocation too large");
+
+ i = (allocBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < allocBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - allocBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_fill: filling unexpected range");
+
+ if (next_skip == 1) {
+ nblks += blst_leaf_fill(&scan[i], allocBlk, v);
+ } else {
+ nblks += blst_meta_fill(&scan[i], allocBlk, v,
+ radix, next_skip - 1, blk);
+ }
+ count -= v;
+ allocBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+ scan->u.bmu_avail -= nblks;
+ return nblks;
+}
+
+/*
+ * BLST_RADIX_INIT() - initialize radix tree
+ *
+ * Initialize our meta structures and bitmaps and calculate the exact
+ * amount of space required to manage 'count' blocks - this space may
+ * be considerably less than the calculated radix due to the large
+ * RADIX values we use.
+ */
+
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+{
+ int i;
+ int next_skip;
+ daddr_t memindex = 0;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_bitmap = 0;
+ }
+ return(memindex);
+ }
+
+ /*
+ * Meta node. If allocating the entire object we can special
+ * case it. However, we need to figure out how much memory
+ * is required to manage 'count' blocks, so we continue on anyway.
+ */
+
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_avail = 0;
+ }
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count >= radix) {
+ /*
+ * Allocate the entire object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ radix
+ );
+ count -= radix;
+ } else if (count > 0) {
+ /*
+ * Allocate a partial object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ count
+ );
+ count = 0;
+ } else {
+ /*
+ * Add terminator and break out
+ */
+ if (scan)
+ scan[i].bm_bighint = (daddr_t)-1;
+ break;
+ }
+ }
+ if (memindex < i)
+ memindex = i;
+ return(memindex);
+}
+
+#ifdef BLIST_DEBUG
+
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+{
+ int i;
+ int next_skip;
+ int lastState = 0;
+
+ if (radix == BLIST_BMAP_RADIX) {
+ printf(
+ "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_bitmap,
+ (long long)scan->bm_bighint
+ );
+ return;
+ }
+
+ if (scan->u.bmu_avail == 0) {
+ printf(
+ "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
+ tab, tab, "",
+ (long long)blk,
+ (long long)radix
+ );
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ printf(
+ "%*.*s(%08llx,%lld) ALL FREE\n",
+ tab, tab, "",
+ (long long)blk,
+ (long long)radix
+ );
+ return;
+ }
+
+ printf(
+ "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_avail,
+ (long long)radix,
+ (long long)scan->bm_bighint
+ );
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+ tab += 4;
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1) {
+ printf(
+ "%*.*s(%08llx,%lld): Terminator\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix
+ );
+ lastState = 0;
+ break;
+ }
+ blst_radix_print(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ tab
+ );
+ blk += radix;
+ }
+ tab -= 4;
+
+ printf(
+ "%*.*s}\n",
+ tab, tab, ""
+ );
+}
+
+#endif
+
+#ifdef BLIST_DEBUG
+
+int
+main(int ac, char **av)
+{
+ int size = 1024;
+ int i;
+ blist_t bl;
+
+ for (i = 1; i < ac; ++i) {
+ const char *ptr = av[i];
+ if (*ptr != '-') {
+ size = strtol(ptr, NULL, 0);
+ continue;
+ }
+ ptr += 2;
+ fprintf(stderr, "Bad option: %s\n", ptr - 2);
+ exit(1);
+ }
+ bl = blist_create(size, M_WAITOK);
+ blist_free(bl, 0, size);
+
+ for (;;) {
+ char buf[1024];
+ daddr_t da = 0;
+ daddr_t count = 0;
+
+
+ printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+ (long long)size, (long long)bl->bl_radix);
+ fflush(stdout);
+ if (fgets(buf, sizeof(buf), stdin) == NULL)
+ break;
+ switch(buf[0]) {
+ case 'r':
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
+			blist_resize(&bl, count, 1, M_WAITOK);
+ } else {
+ printf("?\n");
+ }
+ case 'p':
+ blist_print(bl);
+ break;
+ case 'a':
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
+ daddr_t blk = blist_alloc(bl, count);
+ printf(" R=%08llx\n", (long long)blk);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'f':
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
+ blist_free(bl, da, count);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'l':
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
+ printf(" n=%d\n",
+ blist_fill(bl, da, count));
+ } else {
+ printf("?\n");
+ }
+ break;
+ case '?':
+ case 'h':
+ puts(
+ "p -print\n"
+ "a %d -allocate\n"
+ "f %x %d -free\n"
+ "l %x %d -fill\n"
+ "r %d -resize\n"
+ "h/? -help"
+ );
+ break;
+ default:
+ printf("?\n");
+ break;
+ }
+ }
+ return(0);
+}
+
+void
+panic(const char *ctl, ...)
+{
+ va_list va;
+
+ va_start(va, ctl);
+ vfprintf(stderr, ctl, va);
+ fprintf(stderr, "\n");
+ va_end(va);
+ exit(1);
+}
+
+#endif
+
diff --git a/sys/kern/subr_bufring.c b/sys/kern/subr_bufring.c
new file mode 100644
index 0000000..4cd3929
--- /dev/null
+++ b/sys/kern/subr_bufring.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2007, 2008 Kip Macy <kmacy@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/ktr.h>
+#include <sys/buf_ring.h>
+
+
+struct buf_ring *
+buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock)
+{
+ struct buf_ring *br;
+
+ KASSERT(powerof2(count), ("buf ring must be size power of 2"));
+
+ br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t),
+ type, flags|M_ZERO);
+ if (br == NULL)
+ return (NULL);
+#ifdef DEBUG_BUFRING
+ br->br_lock = lock;
+#endif
+ br->br_prod_size = br->br_cons_size = count;
+ br->br_prod_mask = br->br_cons_mask = count-1;
+ br->br_prod_head = br->br_cons_head = 0;
+ br->br_prod_tail = br->br_cons_tail = 0;
+
+ return (br);
+}
+
+void
+buf_ring_free(struct buf_ring *br, struct malloc_type *type)
+{
+ free(br, type);
+}
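+
+/*
+ * Usage sketch (illustrative; sc, tx_mtx and foo_xmit() are
+ * hypothetical, and buf_ring_enqueue()/buf_ring_dequeue_sc() are the
+ * inline helpers declared in <sys/buf_ring.h>, with enqueue returning
+ * non-zero when the ring is full):
+ *
+ *	br = buf_ring_alloc(256, M_DEVBUF, M_WAITOK, &sc->tx_mtx);
+ *	if (buf_ring_enqueue(br, m) != 0)
+ *		m_freem(m);
+ *	while ((m = buf_ring_dequeue_sc(br)) != NULL)
+ *		foo_xmit(sc, m);
+ *	buf_ring_free(br, M_DEVBUF);
+ */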
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
new file mode 100644
index 0000000..b3b1852
--- /dev/null
+++ b/sys/kern/subr_bus.c
@@ -0,0 +1,4885 @@
+/*-
+ * Copyright (c) 1997,1998,2003 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/filio.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/queue.h>
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <net/vnet.h>
+
+#include <machine/stdarg.h>
+
+#include <vm/uma.h>
+
+SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
+
+/*
+ * Used to attach drivers to devclasses.
+ */
+typedef struct driverlink *driverlink_t;
+struct driverlink {
+ kobj_class_t driver;
+ TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */
+ int pass;
+ TAILQ_ENTRY(driverlink) passlink;
+};
+
+/*
+ * Forward declarations
+ */
+typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
+typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
+typedef TAILQ_HEAD(device_list, device) device_list_t;
+
+struct devclass {
+ TAILQ_ENTRY(devclass) link;
+ devclass_t parent; /* parent in devclass hierarchy */
+ driver_list_t drivers; /* bus devclasses store drivers for bus */
+ char *name;
+ device_t *devices; /* array of devices indexed by unit */
+ int maxunit; /* size of devices array */
+ int flags;
+#define DC_HAS_CHILDREN 1
+
+ struct sysctl_ctx_list sysctl_ctx;
+ struct sysctl_oid *sysctl_tree;
+};
+
+/**
+ * @brief Implementation of device.
+ */
+struct device {
+ /*
+ * A device is a kernel object. The first field must be the
+ * current ops table for the object.
+ */
+ KOBJ_FIELDS;
+
+ /*
+ * Device hierarchy.
+ */
+ TAILQ_ENTRY(device) link; /**< list of devices in parent */
+ TAILQ_ENTRY(device) devlink; /**< global device list membership */
+ device_t parent; /**< parent of this device */
+ device_list_t children; /**< list of child devices */
+
+ /*
+ * Details of this device.
+ */
+ driver_t *driver; /**< current driver */
+ devclass_t devclass; /**< current device class */
+ int unit; /**< current unit number */
+ char* nameunit; /**< name+unit e.g. foodev0 */
+ char* desc; /**< driver specific description */
+ int busy; /**< count of calls to device_busy() */
+ device_state_t state; /**< current device state */
+ uint32_t devflags; /**< api level flags for device_get_flags() */
+ u_int flags; /**< internal device flags */
+#define DF_ENABLED 0x01 /* device should be probed/attached */
+#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */
+#define DF_WILDCARD 0x04 /* unit was originally wildcard */
+#define DF_DESCMALLOCED 0x08 /* description was malloced */
+#define DF_QUIET 0x10 /* don't print verbose attach message */
+#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */
+#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */
+#define DF_REBID 0x80 /* Can rebid after attach */
+ u_int order; /**< order from device_add_child_ordered() */
+ void *ivars; /**< instance variables */
+ void *softc; /**< current driver's variables */
+
+ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */
+ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */
+};
+
+static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
+static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
+
+#ifdef BUS_DEBUG
+
+static int bus_debug = 1;
+TUNABLE_INT("bus.debug", &bus_debug);
+SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
+ "Debug bus code");
+
+#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
+#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
+#define DRIVERNAME(d) ((d)? d->name : "no driver")
+#define DEVCLANAME(d) ((d)? d->name : "no devclass")
+
+/**
+ * Produce the indenting, indent*2 spaces plus a '.' ahead of that to
+ * prevent syslog from deleting initial spaces
+ */
+#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0)
+
+static void print_device_short(device_t dev, int indent);
+static void print_device(device_t dev, int indent);
+void print_device_tree_short(device_t dev, int indent);
+void print_device_tree(device_t dev, int indent);
+static void print_driver_short(driver_t *driver, int indent);
+static void print_driver(driver_t *driver, int indent);
+static void print_driver_list(driver_list_t drivers, int indent);
+static void print_devclass_short(devclass_t dc, int indent);
+static void print_devclass(devclass_t dc, int indent);
+void print_devclass_list_short(void);
+void print_devclass_list(void);
+
+#else
+/* Make the compiler ignore the function calls */
+#define PDEBUG(a) /* nop */
+#define DEVICENAME(d) /* nop */
+#define DRIVERNAME(d) /* nop */
+#define DEVCLANAME(d) /* nop */
+
+#define print_device_short(d,i) /* nop */
+#define print_device(d,i) /* nop */
+#define print_device_tree_short(d,i) /* nop */
+#define print_device_tree(d,i) /* nop */
+#define print_driver_short(d,i) /* nop */
+#define print_driver(d,i) /* nop */
+#define print_driver_list(d,i) /* nop */
+#define print_devclass_short(d,i) /* nop */
+#define print_devclass(d,i) /* nop */
+#define print_devclass_list_short() /* nop */
+#define print_devclass_list() /* nop */
+#endif
+
+/*
+ * dev sysctl tree
+ */
+
+enum {
+ DEVCLASS_SYSCTL_PARENT,
+};
+
+static int
+devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ devclass_t dc = (devclass_t)arg1;
+ const char *value;
+
+ switch (arg2) {
+ case DEVCLASS_SYSCTL_PARENT:
+ value = dc->parent ? dc->parent->name : "";
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (SYSCTL_OUT(req, value, strlen(value)));
+}
+
+static void
+devclass_sysctl_init(devclass_t dc)
+{
+
+ if (dc->sysctl_tree != NULL)
+ return;
+ sysctl_ctx_init(&dc->sysctl_ctx);
+ dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name,
+ CTLFLAG_RD, NULL, "");
+ SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree),
+ OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
+ dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A",
+ "parent class");
+}
+
+enum {
+ DEVICE_SYSCTL_DESC,
+ DEVICE_SYSCTL_DRIVER,
+ DEVICE_SYSCTL_LOCATION,
+ DEVICE_SYSCTL_PNPINFO,
+ DEVICE_SYSCTL_PARENT,
+};
+
+static int
+device_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ device_t dev = (device_t)arg1;
+ const char *value;
+ char *buf;
+ int error;
+
+ buf = NULL;
+ switch (arg2) {
+ case DEVICE_SYSCTL_DESC:
+ value = dev->desc ? dev->desc : "";
+ break;
+ case DEVICE_SYSCTL_DRIVER:
+ value = dev->driver ? dev->driver->name : "";
+ break;
+ case DEVICE_SYSCTL_LOCATION:
+ value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
+ bus_child_location_str(dev, buf, 1024);
+ break;
+ case DEVICE_SYSCTL_PNPINFO:
+ value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
+ bus_child_pnpinfo_str(dev, buf, 1024);
+ break;
+ case DEVICE_SYSCTL_PARENT:
+ value = dev->parent ? dev->parent->nameunit : "";
+ break;
+ default:
+ return (EINVAL);
+ }
+ error = SYSCTL_OUT(req, value, strlen(value));
+ if (buf != NULL)
+ free(buf, M_BUS);
+ return (error);
+}
+
+static void
+device_sysctl_init(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+
+ if (dev->sysctl_tree != NULL)
+ return;
+ devclass_sysctl_init(dc);
+ sysctl_ctx_init(&dev->sysctl_ctx);
+ dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
+ SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
+ dev->nameunit + strlen(dc->name),
+ CTLFLAG_RD, NULL, "");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
+ "device description");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
+ "device driver name");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
+ "device location relative to parent");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
+ "device identification");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
+ "parent device");
+}
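+
+/*
+ * Example (illustrative): for a device "foo0" in devclass "foo", the
+ * code above publishes read-only string nodes such as dev.foo.0.%desc,
+ * dev.foo.0.%driver, dev.foo.0.%location, dev.foo.0.%pnpinfo and
+ * dev.foo.0.%parent, while devclass_sysctl_init() provides
+ * dev.foo.%parent for the class itself.
+ */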
+
+static void
+device_sysctl_update(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+
+ if (dev->sysctl_tree == NULL)
+ return;
+ sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name));
+}
+
+static void
+device_sysctl_fini(device_t dev)
+{
+ if (dev->sysctl_tree == NULL)
+ return;
+ sysctl_ctx_free(&dev->sysctl_ctx);
+ dev->sysctl_tree = NULL;
+}
+
+/*
+ * /dev/devctl implementation
+ */
+
+/*
+ * This design allows only one reader for /dev/devctl. This is not desirable
+ * in the long run, but will get a lot of hair out of this implementation.
+ * Maybe we should make this device a clonable device.
+ *
+ * Also note: we specifically do not attach a device to the device_t tree
+ * to avoid potential chicken and egg problems. One could argue that all
+ * of this belongs to the root node. One could also further argue that the
+ * sysctl interface that we have now might more properly be an ioctl
+ * interface, but at this stage of the game, I'm not inclined to rock that
+ * boat.
+ *
+ * I'm also not sure that the SIGIO support is done correctly or not, as
+ * I copied it from a driver that had SIGIO support that likely hasn't been
+ * tested since 3.4 or 2.2.8!
+ */
+
+/* Deprecated way to adjust queue length */
+static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
+/* XXX Need to support old-style tunable hw.bus.devctl_disable */
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, NULL,
+ 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated");
+
+#define DEVCTL_DEFAULT_QUEUE_LEN 1000
+static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS);
+static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
+TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length);
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW, NULL,
+ 0, sysctl_devctl_queue, "I", "devctl queue length");
+
+static d_open_t devopen;
+static d_close_t devclose;
+static d_read_t devread;
+static d_ioctl_t devioctl;
+static d_poll_t devpoll;
+
+static struct cdevsw dev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_NEEDGIANT,
+ .d_open = devopen,
+ .d_close = devclose,
+ .d_read = devread,
+ .d_ioctl = devioctl,
+ .d_poll = devpoll,
+ .d_name = "devctl",
+};
+
+struct dev_event_info
+{
+ char *dei_data;
+ TAILQ_ENTRY(dev_event_info) dei_link;
+};
+
+TAILQ_HEAD(devq, dev_event_info);
+
+static struct dev_softc
+{
+ int inuse;
+ int nonblock;
+ int queued;
+ struct mtx mtx;
+ struct cv cv;
+ struct selinfo sel;
+ struct devq devq;
+ struct proc *async_proc;
+} devsoftc;
+
+static struct cdev *devctl_dev;
+
+static void
+devinit(void)
+{
+ devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL,
+ UID_ROOT, GID_WHEEL, 0600, "devctl");
+ mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
+ cv_init(&devsoftc.cv, "dev cv");
+ TAILQ_INIT(&devsoftc.devq);
+}
+
+static int
+devopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ if (devsoftc.inuse)
+ return (EBUSY);
+ /* move to init */
+ devsoftc.inuse = 1;
+ devsoftc.nonblock = 0;
+ devsoftc.async_proc = NULL;
+ return (0);
+}
+
+static int
+devclose(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ devsoftc.inuse = 0;
+ mtx_lock(&devsoftc.mtx);
+ cv_broadcast(&devsoftc.cv);
+ mtx_unlock(&devsoftc.mtx);
+ devsoftc.async_proc = NULL;
+ return (0);
+}
+
+/*
+ * The read channel for this device is used to report changes to
+ * userland in realtime. We are required to free the data as well as
+ * the n1 object because we allocate them separately. Also note that
+ * we return one record at a time. If you try to read this device a
+ * character at a time, you will lose the rest of the data. Listening
+ * programs are expected to cope.
+ */
+static int
+devread(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct dev_event_info *n1;
+ int rv;
+
+ mtx_lock(&devsoftc.mtx);
+ while (TAILQ_EMPTY(&devsoftc.devq)) {
+ if (devsoftc.nonblock) {
+ mtx_unlock(&devsoftc.mtx);
+ return (EAGAIN);
+ }
+ rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
+ if (rv) {
+ /*
+ * Need to translate ERESTART to EINTR here? -- jake
+ */
+ mtx_unlock(&devsoftc.mtx);
+ return (rv);
+ }
+ }
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ devsoftc.queued--;
+ mtx_unlock(&devsoftc.mtx);
+ rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ return (rv);
+}
+
+static int
+devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ switch (cmd) {
+
+ case FIONBIO:
+ if (*(int*)data)
+ devsoftc.nonblock = 1;
+ else
+ devsoftc.nonblock = 0;
+ return (0);
+ case FIOASYNC:
+ if (*(int*)data)
+ devsoftc.async_proc = td->td_proc;
+ else
+ devsoftc.async_proc = NULL;
+ return (0);
+
+ /* (un)Support for other fcntl() calls. */
+ case FIOCLEX:
+ case FIONCLEX:
+ case FIONREAD:
+ case FIOSETOWN:
+ case FIOGETOWN:
+ default:
+ break;
+ }
+ return (ENOTTY);
+}
+
+static int
+devpoll(struct cdev *dev, int events, struct thread *td)
+{
+ int revents = 0;
+
+ mtx_lock(&devsoftc.mtx);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (!TAILQ_EMPTY(&devsoftc.devq))
+ revents = events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &devsoftc.sel);
+ }
+ mtx_unlock(&devsoftc.mtx);
+
+ return (revents);
+}
+
+/**
+ * @brief Return whether the userland process is running
+ */
+boolean_t
+devctl_process_running(void)
+{
+ return (devsoftc.inuse == 1);
+}
+
+/**
+ * @brief Queue data to be read from the devctl device
+ *
+ * Generic interface to queue data to the devctl device. It is
+ * assumed that @p data is properly formatted. It is further assumed
+ * that @p data is allocated using the M_BUS malloc type.
+ */
+void
+devctl_queue_data_f(char *data, int flags)
+{
+ struct dev_event_info *n1 = NULL, *n2 = NULL;
+ struct proc *p;
+
+ if (strlen(data) == 0)
+ goto out;
+ if (devctl_queue_length == 0)
+ goto out;
+ n1 = malloc(sizeof(*n1), M_BUS, flags);
+ if (n1 == NULL)
+ goto out;
+ n1->dei_data = data;
+ mtx_lock(&devsoftc.mtx);
+ if (devctl_queue_length == 0) {
+ mtx_unlock(&devsoftc.mtx);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ return;
+ }
+ /* Leave at least one spot in the queue... */
+ while (devsoftc.queued > devctl_queue_length - 1) {
+ n2 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n2, dei_link);
+ free(n2->dei_data, M_BUS);
+ free(n2, M_BUS);
+ devsoftc.queued--;
+ }
+ TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
+ devsoftc.queued++;
+ cv_broadcast(&devsoftc.cv);
+ mtx_unlock(&devsoftc.mtx);
+ selwakeup(&devsoftc.sel);
+ p = devsoftc.async_proc;
+ if (p != NULL) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGIO);
+ PROC_UNLOCK(p);
+ }
+ return;
+out:
+ /*
+ * We have to free data on all error paths since the caller
+ * assumes it will be free'd when this item is dequeued.
+ */
+ free(data, M_BUS);
+ return;
+}
+
+void
+devctl_queue_data(char *data)
+{
+
+ devctl_queue_data_f(data, M_NOWAIT);
+}
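+
+/*
+ * Example (a sketch) of queueing a custom, pre-formatted message.  The
+ * buffer must come from the M_BUS malloc type because the dequeue path
+ * frees it with free(..., M_BUS); the FOO/BAR/BAZ values are placeholders:
+ *
+ *	char *msg;
+ *
+ *	msg = malloc(64, M_BUS, M_NOWAIT);
+ *	if (msg != NULL) {
+ *		snprintf(msg, 64, "!system=FOO subsystem=BAR type=BAZ\n");
+ *		devctl_queue_data(msg);
+ *	}
+ */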
+
+/**
+ * @brief Send a 'notification' to userland, using standard ways
+ */
+void
+devctl_notify_f(const char *system, const char *subsystem, const char *type,
+ const char *data, int flags)
+{
+ int len = 0;
+ char *msg;
+
+ if (system == NULL)
+ return; /* BOGUS! Must specify system. */
+ if (subsystem == NULL)
+ return; /* BOGUS! Must specify subsystem. */
+ if (type == NULL)
+ return; /* BOGUS! Must specify type. */
+ len += strlen(" system=") + strlen(system);
+ len += strlen(" subsystem=") + strlen(subsystem);
+ len += strlen(" type=") + strlen(type);
+ /* add in the data message plus newline. */
+ if (data != NULL)
+ len += strlen(data);
+ len += 3; /* '!', '\n', and NUL */
+ msg = malloc(len, M_BUS, flags);
+ if (msg == NULL)
+ return; /* Drop it on the floor */
+ if (data != NULL)
+ snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n",
+ system, subsystem, type, data);
+ else
+ snprintf(msg, len, "!system=%s subsystem=%s type=%s\n",
+ system, subsystem, type);
+ devctl_queue_data_f(msg, flags);
+}
+
+void
+devctl_notify(const char *system, const char *subsystem, const char *type,
+ const char *data)
+{
+
+ devctl_notify_f(system, subsystem, type, data, M_NOWAIT);
+}
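+
+/*
+ * Example (illustrative only): a subsystem can announce an event to the
+ * /dev/devctl reader with a single call such as
+ *
+ *	devctl_notify("IFNET", "em0", "LINK_UP", NULL);
+ *
+ * which, given the format strings above, queues the line
+ *
+ *	!system=IFNET subsystem=em0 type=LINK_UP
+ *
+ * The system/subsystem/type values here are examples; callers define
+ * their own namespace.
+ */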
+
+/*
+ * Common routine that tries to make sending messages as easy as possible.
+ * We allocate memory for the data, copy strings into that, but do not
+ * free it unless there's an error. The dequeue part of the driver should
+ * free the data. We don't send data when the device is disabled. We do
+ * send data, even when we have no listeners, because we wish to avoid
+ * races relating to startup and restart of listening applications.
+ *
+ * devaddq is designed to string together the type of event, with the
+ * object of that event, plus the plug and play info and location info
+ * for that event. This is likely most useful for devices, but less
+ * useful for other consumers of this interface. Those should use
+ * the devctl_queue_data() interface instead.
+ */
+static void
+devaddq(const char *type, const char *what, device_t dev)
+{
+ char *data = NULL;
+ char *loc = NULL;
+ char *pnp = NULL;
+ const char *parstr;
+
+ if (!devctl_queue_length)	/* Rare race; a lost event is safely discarded */
+ return;
+ data = malloc(1024, M_BUS, M_NOWAIT);
+ if (data == NULL)
+ goto bad;
+
+ /* get the bus specific location of this device */
+ loc = malloc(1024, M_BUS, M_NOWAIT);
+ if (loc == NULL)
+ goto bad;
+ *loc = '\0';
+ bus_child_location_str(dev, loc, 1024);
+
+ /* Get the bus specific pnp info of this device */
+ pnp = malloc(1024, M_BUS, M_NOWAIT);
+ if (pnp == NULL)
+ goto bad;
+ *pnp = '\0';
+ bus_child_pnpinfo_str(dev, pnp, 1024);
+
+ /* Get the parent of this device, or / if high enough in the tree. */
+ if (device_get_parent(dev) == NULL)
+ parstr = "."; /* Or '/' ? */
+ else
+ parstr = device_get_nameunit(device_get_parent(dev));
+ /* String it all together. */
+ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp,
+ parstr);
+ free(loc, M_BUS);
+ free(pnp, M_BUS);
+ devctl_queue_data(data);
+ return;
+bad:
+ free(pnp, M_BUS);
+ free(loc, M_BUS);
+ free(data, M_BUS);
+ return;
+}
+
+/*
+ * A device was added to the tree. We are called just after it successfully
+ * attaches (that is, probe and attach success for this device). No call
+ * is made if a device is merely parented into the tree. See devnomatch
+ * if probe fails. If attach fails, no notification is sent (but maybe
+ * we should have a different message for this).
+ */
+static void
+devadded(device_t dev)
+{
+ devaddq("+", device_get_nameunit(dev), dev);
+}
+
+/*
+ * A device was removed from the tree. We are called just before this
+ * happens.
+ */
+static void
+devremoved(device_t dev)
+{
+ devaddq("-", device_get_nameunit(dev), dev);
+}
+
+/*
+ * Called when there's no match for this device. This is only called
+ * the first time that no match happens, so we don't keep getting this
+ * message. Should that prove to be undesirable, we can change it.
+ * This is called when all drivers that can attach to a given bus
+ * decline to accept this device. Other errors may not be detected.
+ */
+static void
+devnomatch(device_t dev)
+{
+ devaddq("?", "", dev);
+}
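+
+/*
+ * Illustrative examples of the resulting messages (device and bus names,
+ * location and pnpinfo strings are hypothetical placeholders):
+ *
+ *	+foo0 at <location> <pnpinfo> on bus0		(devadded)
+ *	-foo0 at <location> <pnpinfo> on bus0		(devremoved)
+ *	? at <location> <pnpinfo> on bus0		(devnomatch)
+ */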
+
+static int
+sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
+{
+ struct dev_event_info *n1;
+ int dis, error;
+
+ dis = devctl_queue_length == 0;
+ error = sysctl_handle_int(oidp, &dis, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ mtx_lock(&devsoftc.mtx);
+ if (dis) {
+ while (!TAILQ_EMPTY(&devsoftc.devq)) {
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ }
+ devsoftc.queued = 0;
+ devctl_queue_length = 0;
+ } else {
+ devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
+ }
+ mtx_unlock(&devsoftc.mtx);
+ return (0);
+}
+
+static int
+sysctl_devctl_queue(SYSCTL_HANDLER_ARGS)
+{
+ struct dev_event_info *n1;
+ int q, error;
+
+ q = devctl_queue_length;
+ error = sysctl_handle_int(oidp, &q, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (q < 0)
+ return (EINVAL);
+ mtx_lock(&devsoftc.mtx);
+ devctl_queue_length = q;
+ while (devsoftc.queued > devctl_queue_length) {
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ devsoftc.queued--;
+ }
+ mtx_unlock(&devsoftc.mtx);
+ return (0);
+}
+
+/* End of /dev/devctl code */
+
+static TAILQ_HEAD(,device) bus_data_devices;
+static int bus_data_generation = 1;
+
+static kobj_method_t null_methods[] = {
+ KOBJMETHOD_END
+};
+
+DEFINE_CLASS(null, null_methods, 0);
+
+/*
+ * Bus pass implementation
+ */
+
+static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes);
+int bus_current_pass = BUS_PASS_ROOT;
+
+/**
+ * @internal
+ * @brief Register the pass level of a new driver attachment
+ *
+ * Register a new driver attachment's pass level. If no driver
+ * attachment with the same pass level has been added, then @p new
+ * will be added to the global passes list.
+ *
+ * @param new the new driver attachment
+ */
+static void
+driver_register_pass(struct driverlink *new)
+{
+ struct driverlink *dl;
+
+ /* We only consider pass numbers during boot. */
+ if (bus_current_pass == BUS_PASS_DEFAULT)
+ return;
+
+ /*
+ * Walk the passes list. If we already know about this pass
+ * then there is nothing to do. If we don't, then insert this
+ * driver link into the list.
+ */
+ TAILQ_FOREACH(dl, &passes, passlink) {
+ if (dl->pass < new->pass)
+ continue;
+ if (dl->pass == new->pass)
+ return;
+ TAILQ_INSERT_BEFORE(dl, new, passlink);
+ return;
+ }
+ TAILQ_INSERT_TAIL(&passes, new, passlink);
+}
+
+/**
+ * @brief Raise the current bus pass
+ *
+ * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS()
+ * method on the root bus to kick off a new device tree scan for each
+ * new pass level that has at least one driver.
+ */
+void
+bus_set_pass(int pass)
+{
+ struct driverlink *dl;
+
+ if (bus_current_pass > pass)
+ panic("Attempt to lower bus pass level");
+
+ TAILQ_FOREACH(dl, &passes, passlink) {
+ /* Skip pass values below the current pass level. */
+ if (dl->pass <= bus_current_pass)
+ continue;
+
+ /*
+ * Bail once we hit a driver with a pass level that is
+ * too high.
+ */
+ if (dl->pass > pass)
+ break;
+
+ /*
+ * Raise the pass level to the next level and rescan
+ * the tree.
+ */
+ bus_current_pass = dl->pass;
+ BUS_NEW_PASS(root_bus);
+ }
+
+ /*
+ * If there isn't a driver registered for the requested pass,
+ * then bus_current_pass might still be less than 'pass'. Set
+ * it to 'pass' in that case.
+ */
+ if (bus_current_pass < pass)
+ bus_current_pass = pass;
+ KASSERT(bus_current_pass == pass, ("Failed to update bus pass level"));
+}
+
+/*
+ * Devclass implementation
+ */
+
+static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
+
+/**
+ * @internal
+ * @brief Find or create a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise if @p create is non-zero create and return a new device
+ * class.
+ *
+ * If @p parentname is non-NULL, the parent of the devclass is set to
+ * the devclass of that name.
+ *
+ * @param classname the devclass name to find or create
+ * @param parentname the parent devclass name or @c NULL
+ * @param create non-zero to create a devclass
+ */
+static devclass_t
+devclass_find_internal(const char *classname, const char *parentname,
+ int create)
+{
+ devclass_t dc;
+
+ PDEBUG(("looking for %s", classname));
+ if (!classname)
+ return (NULL);
+
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ if (!strcmp(dc->name, classname))
+ break;
+ }
+
+ if (create && !dc) {
+ PDEBUG(("creating %s", classname));
+ dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
+ M_BUS, M_NOWAIT | M_ZERO);
+ if (!dc)
+ return (NULL);
+ dc->parent = NULL;
+ dc->name = (char*) (dc + 1);
+ strcpy(dc->name, classname);
+ TAILQ_INIT(&dc->drivers);
+ TAILQ_INSERT_TAIL(&devclasses, dc, link);
+
+ bus_data_generation_update();
+ }
+
+ /*
+ * If a parent class is specified, then set that as our parent so
+ * that this devclass will support drivers for the parent class as
+ * well. If the parent class has the same name don't do this though
+ * as it creates a cycle that can trigger an infinite loop in
+ * device_probe_child() if a device exists for which there is no
+ * suitable driver.
+ */
+ if (parentname && dc && !dc->parent &&
+ strcmp(classname, parentname) != 0) {
+ dc->parent = devclass_find_internal(parentname, NULL, TRUE);
+ dc->parent->flags |= DC_HAS_CHILDREN;
+ }
+
+ return (dc);
+}
+
+/**
+ * @brief Create a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise create and return a new device class.
+ *
+ * @param classname the devclass name to find or create
+ */
+devclass_t
+devclass_create(const char *classname)
+{
+ return (devclass_find_internal(classname, NULL, TRUE));
+}
+
+/**
+ * @brief Find a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise return @c NULL.
+ *
+ * @param classname the devclass name to find
+ */
+devclass_t
+devclass_find(const char *classname)
+{
+ return (devclass_find_internal(classname, NULL, FALSE));
+}
+
+/**
+ * @brief Register that a device driver has been added to a devclass
+ *
+ * Register that a device driver has been added to a devclass. This
+ * is called by devclass_add_driver to accomplish the recursive
+ * notification of all the children classes of dc, as well as dc.
+ * Each layer will have BUS_DRIVER_ADDED() called for all instances of
+ * the devclass.
+ *
+ * We do a full search here of the devclass list at each iteration
+ * level to save storing children-lists in the devclass structure. If
+ * we ever move beyond a few dozen devices doing this, we may need to
+ * reevaluate...
+ *
+ * @param dc the devclass to edit
+ * @param driver the driver that was just added
+ */
+static void
+devclass_driver_added(devclass_t dc, driver_t *driver)
+{
+ devclass_t parent;
+ int i;
+
+ /*
+ * Call BUS_DRIVER_ADDED for any existing busses in this class.
+ */
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i] && device_is_attached(dc->devices[i]))
+ BUS_DRIVER_ADDED(dc->devices[i], driver);
+
+ /*
+ * Walk through the children classes. Since we only keep a
+ * single parent pointer around, we walk the entire list of
+ * devclasses looking for children. We set the
+ * DC_HAS_CHILDREN flag when a child devclass is created on
+ * the parent, so we only walk the list for those devclasses
+ * that have children.
+ */
+ if (!(dc->flags & DC_HAS_CHILDREN))
+ return;
+ parent = dc;
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ if (dc->parent == parent)
+ devclass_driver_added(dc, driver);
+ }
+}
+
+/**
+ * @brief Add a device driver to a device class
+ *
+ * Add a device driver to a devclass. This is normally called
+ * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
+ * all devices in the devclass will be called to allow them to attempt
+ * to re-probe any unmatched children.
+ *
+ * @param dc the devclass to edit
+ * @param driver the driver to register
+ */
+int
+devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp)
+{
+ driverlink_t dl;
+ const char *parentname;
+
+ PDEBUG(("%s", DRIVERNAME(driver)));
+
+ /* Don't allow invalid pass values. */
+ if (pass <= BUS_PASS_ROOT)
+ return (EINVAL);
+
+ dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dl)
+ return (ENOMEM);
+
+ /*
+ * Compile the driver's methods. Also increase the reference count
+ * so that the class doesn't get freed when the last instance
+ * goes. This means we can safely use static methods, and it avoids a
+ * double-free in devclass_delete_driver.
+ */
+ kobj_class_compile((kobj_class_t) driver);
+
+ /*
+ * If the driver has any base classes, make the
+ * devclass inherit from the devclass of the driver's
+ * first base class. This will allow the system to
+ * search for drivers in both devclasses for children
+ * of a device using this driver.
+ */
+ if (driver->baseclasses)
+ parentname = driver->baseclasses[0]->name;
+ else
+ parentname = NULL;
+ *dcp = devclass_find_internal(driver->name, parentname, TRUE);
+
+ dl->driver = driver;
+ TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
+ driver->refs++; /* XXX: kobj_mtx */
+ dl->pass = pass;
+ driver_register_pass(dl);
+
+ devclass_driver_added(dc, driver);
+ bus_data_generation_update();
+ return (0);
+}
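+
+/*
+ * Drivers normally reach devclass_add_driver() through the DRIVER_MODULE()
+ * glue rather than by calling it directly.  A typical declaration looks
+ * something like the following sketch (the "foo" names are hypothetical):
+ *
+ *	static devclass_t foo_devclass;
+ *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
+ *
+ * which registers foo_driver with the devclass of the "pci" bus at the
+ * default pass.
+ */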
+
+/**
+ * @brief Register that a device driver has been deleted from a devclass
+ *
+ * Register that a device driver has been removed from a devclass.
+ * This is called by devclass_delete_driver to accomplish the
+ * recursive notification of all the children classes of busclass, as
+ * well as busclass. Each layer will attempt to detach the driver
+ * from any devices that are children of the bus's devclass. The function
+ * will return an error if a device fails to detach.
+ *
+ * We do a full search here of the devclass list at each iteration
+ * level to save storing children-lists in the devclass structure. If
+ * we ever move beyond a few dozen devices doing this, we may need to
+ * reevaluate...
+ *
+ * @param busclass the devclass of the parent bus
+ * @param dc the devclass of the driver being deleted
+ * @param driver the driver being deleted
+ */
+static int
+devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver)
+{
+ devclass_t parent;
+ device_t dev;
+ int error, i;
+
+ /*
+ * Disassociate from any devices. We iterate through all the
+ * devices in the devclass of the driver and detach any which are
+ * using the driver and which have a parent in the devclass which
+ * we are deleting from.
+ *
+ * Note that since a driver can be in multiple devclasses, we
+ * should not detach devices which are not children of devices in
+ * the affected devclass.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver && dev->parent &&
+ dev->parent->devclass == busclass) {
+ if ((error = device_detach(dev)) != 0)
+ return (error);
+ BUS_PROBE_NOMATCH(dev->parent, dev);
+ devnomatch(dev);
+ dev->flags |= DF_DONENOMATCH;
+ }
+ }
+ }
+
+ /*
+ * Walk through the children classes. Since we only keep a
+ * single parent pointer around, we walk the entire list of
+ * devclasses looking for children. We set the
+ * DC_HAS_CHILDREN flag when a child devclass is created on
+ * the parent, so we only walk the list for those devclasses
+ * that have children.
+ */
+ if (!(busclass->flags & DC_HAS_CHILDREN))
+ return (0);
+ parent = busclass;
+ TAILQ_FOREACH(busclass, &devclasses, link) {
+ if (busclass->parent == parent) {
+ error = devclass_driver_deleted(busclass, dc, driver);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Delete a device driver from a device class
+ *
+ * Delete a device driver from a devclass. This is normally called
+ * automatically by DRIVER_MODULE().
+ *
+ * If the driver is currently attached to any devices,
+ * devclass_delete_driver() will first attempt to detach from each
+ * device. If one of the detach calls fails, the driver will not be
+ * deleted.
+ *
+ * @param busclass the devclass of the parent bus
+ * @param driver the driver to unregister
+ */
+int
+devclass_delete_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ driverlink_t dl;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return (0);
+
+ /*
+ * Find the link structure in the bus' list of drivers.
+ */
+ TAILQ_FOREACH(dl, &busclass->drivers, link) {
+ if (dl->driver == driver)
+ break;
+ }
+
+ if (!dl) {
+ PDEBUG(("%s not found in %s list", driver->name,
+ busclass->name));
+ return (ENOENT);
+ }
+
+ error = devclass_driver_deleted(busclass, dc, driver);
+ if (error != 0)
+ return (error);
+
+ TAILQ_REMOVE(&busclass->drivers, dl, link);
+ free(dl, M_BUS);
+
+ /* XXX: kobj_mtx */
+ driver->refs--;
+ if (driver->refs == 0)
+ kobj_class_free((kobj_class_t) driver);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Quiesces a set of device drivers from a device class
+ *
+ * Quiesce a device driver from a devclass. This is normally called
+ * automatically by DRIVER_MODULE().
+ *
+ * If the driver is currently attached to any devices,
+ * devclass_quiesce_driver() will first attempt to quiesce each
+ * device.
+ *
+ * @param busclass the devclass of the parent bus
+ * @param driver the driver to unregister
+ */
+static int
+devclass_quiesce_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ driverlink_t dl;
+ device_t dev;
+ int i;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return (0);
+
+ /*
+ * Find the link structure in the bus' list of drivers.
+ */
+ TAILQ_FOREACH(dl, &busclass->drivers, link) {
+ if (dl->driver == driver)
+ break;
+ }
+
+ if (!dl) {
+ PDEBUG(("%s not found in %s list", driver->name,
+ busclass->name));
+ return (ENOENT);
+ }
+
+ /*
+ * Quiesce all devices. We iterate through all the devices in
+ * the devclass of the driver and quiesce any which are using
+ * the driver and which have a parent in the devclass which we
+ * are quiescing.
+ *
+ * Note that since a driver can be in multiple devclasses, we
+ * should not quiesce devices which are not children of
+ * devices in the affected devclass.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver && dev->parent &&
+ dev->parent->devclass == busclass) {
+ if ((error = device_quiesce(dev)) != 0)
+ return (error);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+devclass_find_driver_internal(devclass_t dc, const char *classname)
+{
+ driverlink_t dl;
+
+ PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ if (!strcmp(dl->driver->name, classname))
+ return (dl);
+ }
+
+ PDEBUG(("not found"));
+ return (NULL);
+}
+
+/**
+ * @brief Return the name of the devclass
+ */
+const char *
+devclass_get_name(devclass_t dc)
+{
+ return (dc->name);
+}
+
+/**
+ * @brief Find a device given a unit number
+ *
+ * @param dc the devclass to search
+ * @param unit the unit number to search for
+ *
+ * @returns the device with the given unit number or @c
+ * NULL if there is no such device
+ */
+device_t
+devclass_get_device(devclass_t dc, int unit)
+{
+ if (dc == NULL || unit < 0 || unit >= dc->maxunit)
+ return (NULL);
+ return (dc->devices[unit]);
+}
+
+/**
+ * @brief Find the softc field of a device given a unit number
+ *
+ * @param dc the devclass to search
+ * @param unit the unit number to search for
+ *
+ * @returns the softc field of the device with the given
+ * unit number or @c NULL if there is no such
+ * device
+ */
+void *
+devclass_get_softc(devclass_t dc, int unit)
+{
+ device_t dev;
+
+ dev = devclass_get_device(dc, unit);
+ if (!dev)
+ return (NULL);
+
+ return (device_get_softc(dev));
+}
+
+/**
+ * @brief Get a list of devices in the devclass
+ *
+ * An array containing a list of all the devices in the given devclass
+ * is allocated and returned in @p *devlistp. The number of devices
+ * in the array is returned in @p *devcountp. The caller should free
+ * the array using @c free(p, M_TEMP), even if @p *devcountp is 0.
+ *
+ * @param dc the devclass to examine
+ * @param devlistp points at location for array pointer return
+ * value
+ * @param devcountp points at location for array size return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
+{
+ int count, i;
+ device_t *list;
+
+ count = devclass_get_count(dc);
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ list[count] = dc->devices[i];
+ count++;
+ }
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
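+
+/*
+ * Example usage (a sketch; "foo" is a hypothetical devclass name):
+ *
+ *	devclass_t dc = devclass_find("foo");
+ *	device_t *devs;
+ *	int i, ndevs;
+ *
+ *	if (dc != NULL && devclass_get_devices(dc, &devs, &ndevs) == 0) {
+ *		for (i = 0; i < ndevs; i++)
+ *			device_printf(devs[i], "present\n");
+ *		free(devs, M_TEMP);
+ *	}
+ */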
+
+/**
+ * @brief Get a list of drivers in the devclass
+ *
+ * An array containing a list of pointers to all the drivers in the
+ * given devclass is allocated and returned in @p *listp. The number
+ * of drivers in the array is returned in @p *countp. The caller should
+ * free the array using @c free(p, M_TEMP).
+ *
+ * @param dc the devclass to examine
+ * @param listp gives location for array pointer return value
+ * @param countp gives location for number of array elements
+ * return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
+{
+ driverlink_t dl;
+ driver_t **list;
+ int count;
+
+ count = 0;
+ TAILQ_FOREACH(dl, &dc->drivers, link)
+ count++;
+ list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT);
+ if (list == NULL)
+ return (ENOMEM);
+
+ count = 0;
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ list[count] = dl->driver;
+ count++;
+ }
+ *listp = list;
+ *countp = count;
+
+ return (0);
+}
+
+/**
+ * @brief Get the number of devices in a devclass
+ *
+ * @param dc the devclass to examine
+ */
+int
+devclass_get_count(devclass_t dc)
+{
+ int count, i;
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ count++;
+ return (count);
+}
+
+/**
+ * @brief Get the maximum unit number used in a devclass
+ *
+ * Note that this is one greater than the highest currently-allocated
+ * unit. If a null devclass_t is passed in, -1 is returned to indicate
+ * that not even the devclass has been allocated yet.
+ *
+ * @param dc the devclass to examine
+ */
+int
+devclass_get_maxunit(devclass_t dc)
+{
+ if (dc == NULL)
+ return (-1);
+ return (dc->maxunit);
+}
+
+/**
+ * @brief Find a free unit number in a devclass
+ *
+ * This function searches for the first unused unit number greater
+ * than or equal to @p unit.
+ *
+ * @param dc the devclass to examine
+ * @param unit the first unit number to check
+ */
+int
+devclass_find_free_unit(devclass_t dc, int unit)
+{
+ if (dc == NULL)
+ return (unit);
+ while (unit < dc->maxunit && dc->devices[unit] != NULL)
+ unit++;
+ return (unit);
+}
+
+/**
+ * @brief Set the parent of a devclass
+ *
+ * The parent class is normally initialised automatically by
+ * DRIVER_MODULE().
+ *
+ * @param dc the devclass to edit
+ * @param pdc the new parent devclass
+ */
+void
+devclass_set_parent(devclass_t dc, devclass_t pdc)
+{
+ dc->parent = pdc;
+}
+
+/**
+ * @brief Get the parent of a devclass
+ *
+ * @param dc the devclass to examine
+ */
+devclass_t
+devclass_get_parent(devclass_t dc)
+{
+ return (dc->parent);
+}
+
+struct sysctl_ctx_list *
+devclass_get_sysctl_ctx(devclass_t dc)
+{
+ return (&dc->sysctl_ctx);
+}
+
+struct sysctl_oid *
+devclass_get_sysctl_tree(devclass_t dc)
+{
+ return (dc->sysctl_tree);
+}
+
+/**
+ * @internal
+ * @brief Allocate a unit number
+ *
+ * On entry, @p *unitp is the desired unit number (or @c -1 if any
+ * will do). The allocated unit number is returned in @p *unitp.
+ *
+ * @param dc the devclass to allocate from
+ * @param unitp points at the location for the allocated unit
+ * number
+ *
+ * @retval 0 success
+ * @retval EEXIST the requested unit number is already allocated
+ * @retval ENOMEM memory allocation failure
+ */
+static int
+devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp)
+{
+ const char *s;
+ int unit = *unitp;
+
+ PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ /* Ask the parent bus if it wants to wire this device. */
+ if (unit == -1)
+ BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name,
+ &unit);
+
+ /* If we were given a wired unit number, check for existing device */
+ /* XXX imp XXX */
+ if (unit != -1) {
+ if (unit >= 0 && unit < dc->maxunit &&
+ dc->devices[unit] != NULL) {
+ if (bootverbose)
+ printf("%s: %s%d already exists; skipping it\n",
+ dc->name, dc->name, unit);
+ return (EEXIST);
+ }
+ } else {
+ /* Unwired device, find the next available slot for it */
+ unit = 0;
+ for (unit = 0;; unit++) {
+ /* If there is an "at" hint for a unit then skip it. */
+ if (resource_string_value(dc->name, unit, "at", &s) ==
+ 0)
+ continue;
+
+ /* If this device slot is already in use, skip it. */
+ if (unit < dc->maxunit && dc->devices[unit] != NULL)
+ continue;
+
+ break;
+ }
+ }
+
+ /*
+ * We've selected a unit beyond the length of the table, so let's
+ * extend the table to make room for all units up to and including
+ * this one.
+ */
+ if (unit >= dc->maxunit) {
+ device_t *newlist, *oldlist;
+ int newsize;
+
+ oldlist = dc->devices;
+ newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
+ newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
+ if (!newlist)
+ return (ENOMEM);
+ if (oldlist != NULL)
+ bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit);
+ bzero(newlist + dc->maxunit,
+ sizeof(device_t) * (newsize - dc->maxunit));
+ dc->devices = newlist;
+ dc->maxunit = newsize;
+ if (oldlist != NULL)
+ free(oldlist, M_BUS);
+ }
+ PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ *unitp = unit;
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Add a device to a devclass
+ *
+ * A unit number is allocated for the device (using the device's
+ * preferred unit number if any) and the device is registered in the
+ * devclass. This allows the device to be looked up by its unit
+ * number, e.g. by decoding a dev_t minor number.
+ *
+ * @param dc the devclass to add to
+ * @param dev the device to add
+ *
+ * @retval 0 success
+ * @retval EEXIST the requested unit number is already allocated
+ * @retval ENOMEM memory allocation failure
+ */
+static int
+devclass_add_device(devclass_t dc, device_t dev)
+{
+ int buflen, error;
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
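+ /*
+ * The '$' placeholder makes snprintf() report one extra character,
+ * reserving room for the terminating NUL of the final "%s%d" name.
+ */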
+ buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX);
+ if (buflen < 0)
+ return (ENOMEM);
+ dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev->nameunit)
+ return (ENOMEM);
+
+ if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) {
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+ return (error);
+ }
+ dc->devices[dev->unit] = dev;
+ dev->devclass = dc;
+ snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit);
+
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Delete a device from a devclass
+ *
+ * The device is removed from the devclass's device list and its unit
+ * number is freed.
+ *
+ * @param dc the devclass to delete from
+ * @param dev the device to delete
+ *
+ * @retval 0 success
+ */
+static int
+devclass_delete_device(devclass_t dc, device_t dev)
+{
+ if (!dc || !dev)
+ return (0);
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ if (dev->devclass != dc || dc->devices[dev->unit] != dev)
+ panic("devclass_delete_device: inconsistent device class");
+ dc->devices[dev->unit] = NULL;
+ if (dev->flags & DF_WILDCARD)
+ dev->unit = -1;
+ dev->devclass = NULL;
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Make a new device and add it as a child of @p parent
+ *
+ * @param parent the parent of the new device
+ * @param name the devclass name of the new device or @c NULL
+ * to leave the devclass unspecified
+ * @param unit the unit number of the new device or @c -1 to
+ * leave the unit number unspecified
+ *
+ * @returns the new device
+ */
+static device_t
+make_device(device_t parent, const char *name, int unit)
+{
+ device_t dev;
+ devclass_t dc;
+
+ PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
+
+ if (name) {
+ dc = devclass_find_internal(name, NULL, TRUE);
+ if (!dc) {
+ printf("make_device: can't find device class %s\n",
+ name);
+ return (NULL);
+ }
+ } else {
+ dc = NULL;
+ }
+
+ dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev)
+ return (NULL);
+
+ dev->parent = parent;
+ TAILQ_INIT(&dev->children);
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ dev->devclass = NULL;
+ dev->unit = unit;
+ dev->nameunit = NULL;
+ dev->desc = NULL;
+ dev->busy = 0;
+ dev->devflags = 0;
+ dev->flags = DF_ENABLED;
+ dev->order = 0;
+ if (unit == -1)
+ dev->flags |= DF_WILDCARD;
+ if (name) {
+ dev->flags |= DF_FIXEDCLASS;
+ if (devclass_add_device(dc, dev)) {
+ kobj_delete((kobj_t) dev, M_BUS);
+ return (NULL);
+ }
+ }
+ dev->ivars = NULL;
+ dev->softc = NULL;
+
+ dev->state = DS_NOTPRESENT;
+
+ TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
+ bus_data_generation_update();
+
+ return (dev);
+}
+
+/**
+ * @internal
+ * @brief Print a description of a device.
+ */
+static int
+device_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_is_alive(child))
+ retval += BUS_PRINT_CHILD(dev, child);
+ else
+ retval += device_printf(child, " not found\n");
+
+ return (retval);
+}
+
+/**
+ * @brief Create a new device
+ *
+ * This creates a new device and adds it as a child of an existing
+ * parent device. The new device will be added after the last existing
+ * child with order zero.
+ *
+ * @param dev the device which will be the parent of the
+ * new child device
+ * @param name devclass name for new device or @c NULL if not
+ * specified
+ * @param unit unit number for new device or @c -1 if not
+ * specified
+ *
+ * @returns the new device
+ */
+device_t
+device_add_child(device_t dev, const char *name, int unit)
+{
+ return (device_add_child_ordered(dev, 0, name, unit));
+}
+
+/**
+ * @brief Create a new device
+ *
+ * This creates a new device and adds it as a child of an existing
+ * parent device. The new device will be added after the last existing
+ * child with the same order.
+ *
+ * @param dev the device which will be the parent of the
+ * new child device
+ * @param order a value which is used to partially sort the
+ * children of @p dev - devices created using
+ * lower values of @p order appear first in @p
+ * dev's list of children
+ * @param name devclass name for new device or @c NULL if not
+ * specified
+ * @param unit unit number for new device or @c -1 if not
+ * specified
+ *
+ * @returns the new device
+ */
+device_t
+device_add_child_ordered(device_t dev, u_int order, const char *name, int unit)
+{
+ device_t child;
+ device_t place;
+
+ PDEBUG(("%s at %s with order %u as unit %d",
+ name, DEVICENAME(dev), order, unit));
+ KASSERT(name != NULL || unit == -1,
+ ("child device with wildcard name and specific unit number"));
+
+ child = make_device(dev, name, unit);
+ if (child == NULL)
+ return (child);
+ child->order = order;
+
+ TAILQ_FOREACH(place, &dev->children, link) {
+ if (place->order > order)
+ break;
+ }
+
+ if (place) {
+ /*
+ * The device 'place' is the first device whose order is
+ * greater than the new child.
+ */
+ TAILQ_INSERT_BEFORE(place, child, link);
+ } else {
+ /*
+ * The new child's order is greater or equal to the order of
+ * any existing device. Add the child to the tail of the list.
+ */
+ TAILQ_INSERT_TAIL(&dev->children, child, link);
+ }
+
+ bus_data_generation_update();
+ return (child);
+}
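+
+/*
+ * Example (a sketch): a bus driver typically adds children from its
+ * attach or identify routine with something like
+ *
+ *	child = device_add_child(dev, "foo", -1);
+ *	if (child == NULL)
+ *		device_printf(dev, "could not add foo child\n");
+ *
+ * where "foo" is a hypothetical devclass name and -1 asks for the next
+ * free unit number.
+ */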
+
+/**
+ * @brief Delete a device
+ *
+ * This function deletes a device along with all of its children. If
+ * the device currently has a driver attached to it, the device is
+ * detached first using device_detach().
+ *
+ * @param dev the parent device
+ * @param child the device to delete
+ *
+ * @retval 0 success
+ * @retval non-zero a unix error code describing the error
+ */
+int
+device_delete_child(device_t dev, device_t child)
+{
+ int error;
+ device_t grandchild;
+
+ PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
+
+ /* remove children first */
+ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) {
+ error = device_delete_child(child, grandchild);
+ if (error)
+ return (error);
+ }
+
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ if (child->devclass)
+ devclass_delete_device(child->devclass, child);
+ if (child->parent)
+ BUS_CHILD_DELETED(dev, child);
+ TAILQ_REMOVE(&dev->children, child, link);
+ TAILQ_REMOVE(&bus_data_devices, child, devlink);
+ kobj_delete((kobj_t) child, M_BUS);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Delete all children devices of the given device, if any.
+ *
+ * This function deletes all children devices of the given device, if
+ * any, using the device_delete_child() function for each device it
+ * finds. If a child device cannot be deleted, this function will
+ * return an error code.
+ *
+ * @param dev the parent device
+ *
+ * @retval 0 success
+ * @retval non-zero a device would not detach
+ */
+int
+device_delete_children(device_t dev)
+{
+ device_t child;
+ int error;
+
+ PDEBUG(("Deleting all children of %s", DEVICENAME(dev)));
+
+ error = 0;
+
+ while ((child = TAILQ_FIRST(&dev->children)) != NULL) {
+ error = device_delete_child(dev, child);
+ if (error) {
+ PDEBUG(("Failed deleting %s", DEVICENAME(child)));
+ break;
+ }
+ }
+ return (error);
+}
+
+/**
+ * @brief Find a device given a unit number
+ *
+ * This is similar to devclass_get_devices() but only searches for
+ * devices which have @p dev as a parent.
+ *
+ * @param dev the parent device to search
+ * @param unit the unit number to search for. If the unit is -1,
+ * return the first child of @p dev which has name
+ * @p classname (that is, the one with the lowest unit.)
+ *
+ * @returns the device with the given unit number or @c
+ * NULL if there is no such device
+ */
+device_t
+device_find_child(device_t dev, const char *classname, int unit)
+{
+ devclass_t dc;
+ device_t child;
+
+ dc = devclass_find(classname);
+ if (!dc)
+ return (NULL);
+
+ if (unit != -1) {
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return (child);
+ } else {
+ for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return (child);
+ }
+ }
+ return (NULL);
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+first_matching_driver(devclass_t dc, device_t dev)
+{
+ if (dev->devclass)
+ return (devclass_find_driver_internal(dc, dev->devclass->name));
+ return (TAILQ_FIRST(&dc->drivers));
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
+{
+ if (dev->devclass) {
+ driverlink_t dl;
+ for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link))
+ if (!strcmp(dev->devclass->name, dl->driver->name))
+ return (dl);
+ return (NULL);
+ }
+ return (TAILQ_NEXT(last, link));
+}
+
+/**
+ * @internal
+ */
+int
+device_probe_child(device_t dev, device_t child)
+{
+ devclass_t dc;
+ driverlink_t best = NULL;
+ driverlink_t dl;
+ int result, pri = 0;
+ int hasclass = (child->devclass != NULL);
+
+ GIANT_REQUIRED;
+
+ dc = dev->devclass;
+ if (!dc)
+ panic("device_probe_child: parent device has no devclass");
+
+ /*
+ * If the state is already probed, then return. However, don't
+ * return if we can rebid this object.
+ */
+ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0)
+ return (0);
+
+ for (; dc; dc = dc->parent) {
+ for (dl = first_matching_driver(dc, child);
+ dl;
+ dl = next_matching_driver(dc, child, dl)) {
+ /* If this driver's pass is too high, then ignore it. */
+ if (dl->pass > bus_current_pass)
+ continue;
+
+ PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
+ result = device_set_driver(child, dl->driver);
+ if (result == ENOMEM)
+ return (result);
+ else if (result != 0)
+ continue;
+ if (!hasclass) {
+ if (device_set_devclass(child,
+ dl->driver->name) != 0) {
+ char const * devname =
+ device_get_name(child);
+ if (devname == NULL)
+ devname = "(unknown)";
+ printf("driver bug: Unable to set "
+ "devclass (class: %s "
+ "devname: %s)\n",
+ dl->driver->name,
+ devname);
+ (void)device_set_driver(child, NULL);
+ continue;
+ }
+ }
+
+ /* Fetch any flags for the device before probing. */
+ resource_int_value(dl->driver->name, child->unit,
+ "flags", &child->devflags);
+
+ result = DEVICE_PROBE(child);
+
+ /* Reset flags and devclass before the next probe. */
+ child->devflags = 0;
+ if (!hasclass)
+ (void)device_set_devclass(child, NULL);
+
+ /*
+ * If the driver returns SUCCESS, there can be
+ * no higher match for this device.
+ */
+ if (result == 0) {
+ best = dl;
+ pri = 0;
+ break;
+ }
+
+ /*
+ * The driver returned an error so it
+ * certainly doesn't match.
+ */
+ if (result > 0) {
+ (void)device_set_driver(child, NULL);
+ continue;
+ }
+
+ /*
+ * The driver returned a priority lower than
+ * SUCCESS; remember the best matching driver.
+ * Initialise pri on the first match.
+ */
+ if (best == NULL || result > pri) {
+ /*
+ * Probes that return BUS_PROBE_NOWILDCARD
+ * or lower only match on devices whose
+ * driver was explicitly specified.
+ */
+ if (result <= BUS_PROBE_NOWILDCARD &&
+ !(child->flags & DF_FIXEDCLASS))
+ continue;
+ best = dl;
+ pri = result;
+ continue;
+ }
+ }
+ /*
+ * If we have an unambiguous match in this devclass,
+ * don't look in the parent.
+ */
+ if (best && pri == 0)
+ break;
+ }
+
+ /*
+ * If we found a driver, change state and initialise the devclass.
+ */
+ /* XXX What happens if we rebid and got no best? */
+ if (best) {
+ /*
+ * If this device was attached, and we were asked to
+ * rescan, and it is a different driver, then we have
+ * to detach the old driver and reattach this new one.
+ * Note, we don't have to check for DF_REBID here
+ * because if the state is > DS_ALIVE, we know it must
+ * be.
+ *
+ * This assumes that all DF_REBID drivers can have
+ * their probe routine called at any time and that
+ * they are idempotent as well as completely benign in
+ * normal operations.
+ *
+ * We also have to make sure that the detach
+ * succeeded, otherwise we fail the operation (or
+ * maybe it should just fail silently? I'm torn).
+ */
+ if (child->state > DS_ALIVE && best->driver != child->driver)
+ if ((result = device_detach(child)) != 0)
+ return (result);
+
+ /* Set the winning driver, devclass, and flags. */
+ if (!child->devclass) {
+ result = device_set_devclass(child, best->driver->name);
+ if (result != 0)
+ return (result);
+ }
+ result = device_set_driver(child, best->driver);
+ if (result != 0)
+ return (result);
+ resource_int_value(best->driver->name, child->unit,
+ "flags", &child->devflags);
+
+ if (pri < 0) {
+ /*
+ * A bit bogus. Call the probe method again to make
+ * sure that we have the right description.
+ */
+ DEVICE_PROBE(child);
+#if 0
+ child->flags |= DF_REBID;
+#endif
+ } else
+ child->flags &= ~DF_REBID;
+ child->state = DS_ALIVE;
+
+ bus_data_generation_update();
+ return (0);
+ }
+
+ return (ENXIO);
+}
+
+/**
+ * @brief Return the parent of a device
+ */
+device_t
+device_get_parent(device_t dev)
+{
+ return (dev->parent);
+}
+
+/**
+ * @brief Get a list of children of a device
+ *
+ * An array containing a list of all the children of the given device
+ * is allocated and returned in @p *devlistp. The number of devices
+ * in the array is returned in @p *devcountp. The caller should free
+ * the array using @c free(p, M_TEMP).
+ *
+ * @param dev the device to examine
+ * @param devlistp points at location for array pointer return
+ * value
+ * @param devcountp points at location for array size return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+device_get_children(device_t dev, device_t **devlistp, int *devcountp)
+{
+ int count;
+ device_t child;
+ device_t *list;
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ count++;
+ }
+ if (count == 0) {
+ *devlistp = NULL;
+ *devcountp = 0;
+ return (0);
+ }
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ list[count] = child;
+ count++;
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
+
+/**
+ * @brief Return the current driver for the device or @c NULL if there
+ * is no driver currently attached
+ */
+driver_t *
+device_get_driver(device_t dev)
+{
+ return (dev->driver);
+}
+
+/**
+ * @brief Return the current devclass for the device or @c NULL if
+ * there is none.
+ */
+devclass_t
+device_get_devclass(device_t dev)
+{
+ return (dev->devclass);
+}
+
+/**
+ * @brief Return the name of the device's devclass or @c NULL if there
+ * is none.
+ */
+const char *
+device_get_name(device_t dev)
+{
+ if (dev != NULL && dev->devclass)
+ return (devclass_get_name(dev->devclass));
+ return (NULL);
+}
+
+/**
+ * @brief Return a string containing the device's devclass name
+ * followed by an ascii representation of the device's unit number
+ * (e.g. @c "foo2").
+ */
+const char *
+device_get_nameunit(device_t dev)
+{
+ return (dev->nameunit);
+}
+
+/**
+ * @brief Return the device's unit number.
+ */
+int
+device_get_unit(device_t dev)
+{
+ return (dev->unit);
+}
+
+/**
+ * @brief Return the device's description string
+ */
+const char *
+device_get_desc(device_t dev)
+{
+ return (dev->desc);
+}
+
+/**
+ * @brief Return the device's flags
+ */
+uint32_t
+device_get_flags(device_t dev)
+{
+ return (dev->devflags);
+}
+
+struct sysctl_ctx_list *
+device_get_sysctl_ctx(device_t dev)
+{
+ return (&dev->sysctl_ctx);
+}
+
+struct sysctl_oid *
+device_get_sysctl_tree(device_t dev)
+{
+ return (dev->sysctl_tree);
+}
+
+/**
+ * @brief Print the name of the device followed by a colon and a space
+ *
+ * @returns the number of characters printed
+ */
+int
+device_print_prettyname(device_t dev)
+{
+ const char *name = device_get_name(dev);
+
+ if (name == NULL)
+ return (printf("unknown: "));
+ return (printf("%s%d: ", name, device_get_unit(dev)));
+}
+
+/**
+ * @brief Print the name of the device followed by a colon, a space
+ * and the result of calling vprintf() with the value of @p fmt and
+ * the following arguments.
+ *
+ * @returns the number of characters printed
+ */
+int
+device_printf(device_t dev, const char * fmt, ...)
+{
+ va_list ap;
+ int retval;
+
+ retval = device_print_prettyname(dev);
+ va_start(ap, fmt);
+ retval += vprintf(fmt, ap);
+ va_end(ap);
+ return (retval);
+}
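+
+/*
+ * Example: device_printf(dev, "%d bytes transferred\n", count) prints a
+ * line such as "foo0: 512 bytes transferred" (names and values here are
+ * illustrative).
+ */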
+
+/**
+ * @internal
+ */
+static void
+device_set_desc_internal(device_t dev, const char* desc, int copy)
+{
+ if (dev->desc && (dev->flags & DF_DESCMALLOCED)) {
+ free(dev->desc, M_BUS);
+ dev->flags &= ~DF_DESCMALLOCED;
+ dev->desc = NULL;
+ }
+
+ if (copy && desc) {
+ dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
+ if (dev->desc) {
+ strcpy(dev->desc, desc);
+ dev->flags |= DF_DESCMALLOCED;
+ }
+ } else {
+ /* Avoid a -Wcast-qual warning */
+ dev->desc = (char *)(uintptr_t) desc;
+ }
+
+ bus_data_generation_update();
+}
+
+/**
+ * @brief Set the device's description
+ *
+ * The value of @c desc should be a string constant that will not
+ * change (at least until the description is changed in a subsequent
+ * call to device_set_desc() or device_set_desc_copy()).
+ */
+void
+device_set_desc(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, FALSE);
+}
+
+/**
+ * @brief Set the device's description
+ *
+ * The string pointed to by @c desc is copied. Use this function if
+ * the device description is generated, (e.g. with sprintf()).
+ */
+void
+device_set_desc_copy(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, TRUE);
+}
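+
+/*
+ * Example (a sketch): build a generated description in a stack buffer and
+ * let the bus code keep its own copy:
+ *
+ *	char desc[64];
+ *
+ *	snprintf(desc, sizeof(desc), "Frobnicator model %d", model);
+ *	device_set_desc_copy(dev, desc);
+ */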
+
+/**
+ * @brief Set the device's flags
+ */
+void
+device_set_flags(device_t dev, uint32_t flags)
+{
+ dev->devflags = flags;
+}
+
+/**
+ * @brief Return the device's softc field
+ *
+ * The softc is allocated and zeroed when a driver is attached, based
+ * on the size field of the driver.
+ */
+void *
+device_get_softc(device_t dev)
+{
+ return (dev->softc);
+}
+
+/**
+ * @brief Set the device's softc field
+ *
+ * Most drivers do not need to use this since the softc is allocated
+ * automatically when the driver is attached.
+ */
+void
+device_set_softc(device_t dev, void *softc)
+{
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC))
+ free(dev->softc, M_BUS_SC);
+ dev->softc = softc;
+ if (dev->softc)
+ dev->flags |= DF_EXTERNALSOFTC;
+ else
+ dev->flags &= ~DF_EXTERNALSOFTC;
+}
+
+/**
+ * @brief Free claimed softc
+ *
+ * Most drivers do not need to use this since the softc is freed
+ * automatically when the driver is detached.
+ */
+void
+device_free_softc(void *softc)
+{
+ free(softc, M_BUS_SC);
+}
+
+/**
+ * @brief Claim softc
+ *
+ * This function can be used to let the driver free the automatically
+ * allocated softc using "device_free_softc()". This function is
+ * useful when the driver is refcounting the softc and the softc
+ * cannot be freed when the "device_detach" method is called.
+ */
+void
+device_claim_softc(device_t dev)
+{
+ if (dev->softc)
+ dev->flags |= DF_EXTERNALSOFTC;
+ else
+ dev->flags &= ~DF_EXTERNALSOFTC;
+}
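+
+/*
+ * Sketch of the refcounted-softc pattern these two functions support (the
+ * "foo" names are hypothetical).  The detach method claims the softc so
+ * the bus code will not free it, and the driver frees it itself with
+ * device_free_softc() once the last reference goes away:
+ *
+ *	static int
+ *	foo_detach(device_t dev)
+ *	{
+ *		struct foo_softc *sc = device_get_softc(dev);
+ *
+ *		device_claim_softc(dev);
+ *		foo_release(sc);
+ *		return (0);
+ *	}
+ *
+ * where foo_release() calls device_free_softc(sc) when the reference
+ * count drops to zero.
+ */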
+
+/**
+ * @brief Get the device's ivars field
+ *
+ * The ivars field is used by the parent device to store per-device
+ * state (e.g. the physical location of the device or a list of
+ * resources).
+ */
+void *
+device_get_ivars(device_t dev)
+{
+
+ KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
+ return (dev->ivars);
+}
+
+/**
+ * @brief Set the device's ivars field
+ */
+void
+device_set_ivars(device_t dev, void * ivars)
+{
+
+ KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
+ dev->ivars = ivars;
+}
+
+/**
+ * @brief Return the device's state
+ */
+device_state_t
+device_get_state(device_t dev)
+{
+ return (dev->state);
+}
+
+/**
+ * @brief Set the DF_ENABLED flag for the device
+ */
+void
+device_enable(device_t dev)
+{
+ dev->flags |= DF_ENABLED;
+}
+
+/**
+ * @brief Clear the DF_ENABLED flag for the device
+ */
+void
+device_disable(device_t dev)
+{
+ dev->flags &= ~DF_ENABLED;
+}
+
+/**
+ * @brief Increment the busy counter for the device
+ */
+void
+device_busy(device_t dev)
+{
+ if (dev->state < DS_ATTACHING)
+ panic("device_busy: called for unattached device");
+ if (dev->busy == 0 && dev->parent)
+ device_busy(dev->parent);
+ dev->busy++;
+ if (dev->state == DS_ATTACHED)
+ dev->state = DS_BUSY;
+}
+
+/**
+ * @brief Decrement the busy counter for the device
+ */
+void
+device_unbusy(device_t dev)
+{
+ if (dev->busy != 0 && dev->state != DS_BUSY &&
+ dev->state != DS_ATTACHING)
+ panic("device_unbusy: called for non-busy device %s",
+ device_get_nameunit(dev));
+ dev->busy--;
+ if (dev->busy == 0) {
+ if (dev->parent)
+ device_unbusy(dev->parent);
+ if (dev->state == DS_BUSY)
+ dev->state = DS_ATTACHED;
+ }
+}
+
+/**
+ * @brief Set the DF_QUIET flag for the device
+ */
+void
+device_quiet(device_t dev)
+{
+ dev->flags |= DF_QUIET;
+}
+
+/**
+ * @brief Clear the DF_QUIET flag for the device
+ */
+void
+device_verbose(device_t dev)
+{
+ dev->flags &= ~DF_QUIET;
+}
+
+/**
+ * @brief Return non-zero if the DF_QUIET flag is set on the device
+ */
+int
+device_is_quiet(device_t dev)
+{
+ return ((dev->flags & DF_QUIET) != 0);
+}
+
+/**
+ * @brief Return non-zero if the DF_ENABLED flag is set on the device
+ */
+int
+device_is_enabled(device_t dev)
+{
+ return ((dev->flags & DF_ENABLED) != 0);
+}
+
+/**
+ * @brief Return non-zero if the device was successfully probed
+ */
+int
+device_is_alive(device_t dev)
+{
+ return (dev->state >= DS_ALIVE);
+}
+
+/**
+ * @brief Return non-zero if the device currently has a driver
+ * attached to it
+ */
+int
+device_is_attached(device_t dev)
+{
+ return (dev->state >= DS_ATTACHED);
+}
+
+/**
+ * @brief Set the devclass of a device
+ * @see devclass_add_device().
+ */
+int
+device_set_devclass(device_t dev, const char *classname)
+{
+ devclass_t dc;
+ int error;
+
+ if (!classname) {
+ if (dev->devclass)
+ devclass_delete_device(dev->devclass, dev);
+ return (0);
+ }
+
+ if (dev->devclass) {
+ printf("device_set_devclass: device class already set\n");
+ return (EINVAL);
+ }
+
+ dc = devclass_find_internal(classname, NULL, TRUE);
+ if (!dc)
+ return (ENOMEM);
+
+ error = devclass_add_device(dc, dev);
+
+ bus_data_generation_update();
+ return (error);
+}
+
+/**
+ * @brief Set the driver of a device
+ *
+ * @retval 0 success
+ * @retval EBUSY the device already has a driver attached
+ * @retval ENOMEM a memory allocation failure occurred
+ */
+int
+device_set_driver(device_t dev, driver_t *driver)
+{
+ if (dev->state >= DS_ATTACHED)
+ return (EBUSY);
+
+ if (dev->driver == driver)
+ return (0);
+
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) {
+ free(dev->softc, M_BUS_SC);
+ dev->softc = NULL;
+ }
+ device_set_desc(dev, NULL);
+ kobj_delete((kobj_t) dev, NULL);
+ dev->driver = driver;
+ if (driver) {
+ kobj_init((kobj_t) dev, (kobj_class_t) driver);
+ if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) {
+ dev->softc = malloc(driver->size, M_BUS_SC,
+ M_NOWAIT | M_ZERO);
+ if (!dev->softc) {
+ kobj_delete((kobj_t) dev, NULL);
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ return (ENOMEM);
+ }
+ }
+ } else {
+ kobj_init((kobj_t) dev, &null_class);
+ }
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Probe a device and return its status.
+ *
+ * This function is the core of the device autoconfiguration
+ * system. Its purpose is to select a suitable driver for a device and
+ * then call that driver to initialise the hardware appropriately. The
+ * driver is selected by calling the DEVICE_PROBE() method of a set of
+ * candidate drivers and then choosing the driver which returned the
+ * best value. This driver is then attached to the device using
+ * device_attach().
+ *
+ * The set of suitable drivers is taken from the list of drivers in
+ * the parent device's devclass. If the device was originally created
+ * with a specific class name (see device_add_child()), only drivers
+ * with that name are probed, otherwise all drivers in the devclass
+ * are probed. If no drivers return successful probe values in the
+ * parent devclass, the search continues in the parent of that
+ * devclass (see devclass_get_parent()) if any.
+ *
+ * @param dev the device to initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ * @retval -1 Device already attached
+ */
+int
+device_probe(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0)
+ return (-1);
+
+ if (!(dev->flags & DF_ENABLED)) {
+ if (bootverbose && device_get_name(dev) != NULL) {
+ device_print_prettyname(dev);
+ printf("not probed (disabled)\n");
+ }
+ return (-1);
+ }
+ if ((error = device_probe_child(dev->parent, dev)) != 0) {
+ if (bus_current_pass == BUS_PASS_DEFAULT &&
+ !(dev->flags & DF_DONENOMATCH)) {
+ BUS_PROBE_NOMATCH(dev->parent, dev);
+ devnomatch(dev);
+ dev->flags |= DF_DONENOMATCH;
+ }
+ return (error);
+ }
+ return (0);
+}
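+
+/*
+ * Illustrative sketch (not part of this file's logic): the DEVICE_PROBE()
+ * methods that device_probe() selects among are supplied by individual
+ * drivers. A PCI-style driver's probe method typically looks roughly like
+ * the following; the "foo" names and FOO_VENDOR_ID are hypothetical.
+ *
+ *    static int
+ *    foo_probe(device_t dev)
+ *    {
+ *            if (pci_get_vendor(dev) != FOO_VENDOR_ID)
+ *                    return (ENXIO);
+ *            device_set_desc(dev, "Foo example controller");
+ *            return (BUS_PROBE_DEFAULT);
+ *    }
+ */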
+
+/**
+ * @brief Probe a device and attach a driver if possible
+ *
+ * This function calls device_probe() and, if the probe was
+ * successful, attaches the device with device_attach().
+ */
+int
+device_probe_and_attach(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ error = device_probe(dev);
+ if (error == -1)
+ return (0);
+ else if (error != 0)
+ return (error);
+
+ CURVNET_SET_QUIET(vnet0);
+ error = device_attach(dev);
+ CURVNET_RESTORE();
+	return (error);
+}
+
+/**
+ * @brief Attach a device driver to a device
+ *
+ * This function is a wrapper around the DEVICE_ATTACH() driver
+ * method. In addition to calling DEVICE_ATTACH(), it initialises the
+ * device's sysctl tree, optionally prints a description of the device
+ * and queues a notification event for user-based device management
+ * services.
+ *
+ * Normally this function is only called internally from
+ * device_probe_and_attach().
+ *
+ * @param dev the device to initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_attach(device_t dev)
+{
+ int error;
+
+ if (resource_disabled(dev->driver->name, dev->unit)) {
+ device_disable(dev);
+ if (bootverbose)
+ device_printf(dev, "disabled via hints entry\n");
+ return (ENXIO);
+ }
+
+ device_sysctl_init(dev);
+ if (!device_is_quiet(dev))
+ device_print_child(dev->parent, dev);
+ dev->state = DS_ATTACHING;
+ if ((error = DEVICE_ATTACH(dev)) != 0) {
+ printf("device_attach: %s%d attach returned %d\n",
+ dev->driver->name, dev->unit, error);
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+ (void)device_set_driver(dev, NULL);
+ device_sysctl_fini(dev);
+ KASSERT(dev->busy == 0, ("attach failed but busy"));
+ dev->state = DS_NOTPRESENT;
+ return (error);
+ }
+ device_sysctl_update(dev);
+ if (dev->busy)
+ dev->state = DS_BUSY;
+ else
+ dev->state = DS_ATTACHED;
+ dev->flags &= ~DF_DONENOMATCH;
+ devadded(dev);
+ return (0);
+}
+
+/**
+ * @brief Detach a driver from a device
+ *
+ * This function is a wrapper around the DEVICE_DETACH() driver
+ * method. If the call to DEVICE_DETACH() succeeds, it calls
+ * BUS_CHILD_DETACHED() for the parent of @p dev, queues a
+ * notification event for user-based device management services and
+ * cleans up the device's sysctl tree.
+ *
+ * @param dev the device to un-initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_detach(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return (EBUSY);
+ if (dev->state != DS_ATTACHED)
+ return (0);
+
+ if ((error = DEVICE_DETACH(dev)) != 0)
+ return (error);
+ devremoved(dev);
+ if (!device_is_quiet(dev))
+ device_printf(dev, "detached\n");
+ if (dev->parent)
+ BUS_CHILD_DETACHED(dev->parent, dev);
+
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+
+ dev->state = DS_NOTPRESENT;
+ (void)device_set_driver(dev, NULL);
+ device_sysctl_fini(dev);
+
+ return (0);
+}
+
+/**
+ * @brief Tells a driver to quiesce itself.
+ *
+ * This function is a wrapper around the DEVICE_QUIESCE() driver
+ * method. The method is only invoked if the device is currently
+ * attached and not busy.
+ *
+ * @param dev the device to quiesce
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_quiesce(device_t dev)
+{
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return (EBUSY);
+ if (dev->state != DS_ATTACHED)
+ return (0);
+
+ return (DEVICE_QUIESCE(dev));
+}
+
+/**
+ * @brief Notify a device of system shutdown
+ *
+ * This function calls the DEVICE_SHUTDOWN() driver method if the
+ * device currently has an attached driver.
+ *
+ * @returns the value returned by DEVICE_SHUTDOWN()
+ */
+int
+device_shutdown(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ return (0);
+ return (DEVICE_SHUTDOWN(dev));
+}
+
+/**
+ * @brief Set the unit number of a device
+ *
+ * This function can be used to override the unit number used for a
+ * device (e.g. to wire a device to a pre-configured unit number).
+ */
+int
+device_set_unit(device_t dev, int unit)
+{
+ devclass_t dc;
+ int err;
+
+ dc = device_get_devclass(dev);
+ if (unit < dc->maxunit && dc->devices[unit])
+ return (EBUSY);
+ err = devclass_delete_device(dc, dev);
+ if (err)
+ return (err);
+ dev->unit = unit;
+ err = devclass_add_device(dc, dev);
+ if (err)
+ return (err);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/*======================================*/
+/*
+ * Some useful method implementations to make life easier for bus drivers.
+ */
+
+/**
+ * @brief Initialise a resource list.
+ *
+ * @param rl the resource list to initialise
+ */
+void
+resource_list_init(struct resource_list *rl)
+{
+ STAILQ_INIT(rl);
+}
+
+/**
+ * @brief Reclaim memory used by a resource list.
+ *
+ * This function frees the memory for all resource entries on the list
+ * (if any).
+ *
+ * @param rl the resource list to free
+ */
+void
+resource_list_free(struct resource_list *rl)
+{
+ struct resource_list_entry *rle;
+
+ while ((rle = STAILQ_FIRST(rl)) != NULL) {
+ if (rle->res)
+ panic("resource_list_free: resource entry is busy");
+ STAILQ_REMOVE_HEAD(rl, link);
+ free(rle, M_BUS);
+ }
+}
+
+/**
+ * @brief Add a resource entry.
+ *
+ * This function adds a resource entry using the given @p type, @p
+ * start, @p end and @p count values. A rid value is chosen by
+ * searching sequentially for the first unused rid starting at zero.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param start the start address of the resource
+ * @param end the end address of the resource
+ * @param count	the size of the range, i.e. end - start + 1
+ */
+int
+resource_list_add_next(struct resource_list *rl, int type, u_long start,
+ u_long end, u_long count)
+{
+ int rid;
+
+ rid = 0;
+ while (resource_list_find(rl, type, rid) != NULL)
+ rid++;
+ resource_list_add(rl, type, rid, start, end, count);
+ return (rid);
+}
+
+/**
+ * @brief Add or modify a resource entry.
+ *
+ * If an existing entry exists with the same type and rid, it will be
+ * modified using the given values of @p start, @p end and @p
+ * count. If no entry exists, a new one will be created using the
+ * given values. The resource list entry that matches is then returned.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ * @param start the start address of the resource
+ * @param end the end address of the resource
+ * @param count	the size of the range, i.e. end - start + 1
+ */
+struct resource_list_entry *
+resource_list_add(struct resource_list *rl, int type, int rid,
+ u_long start, u_long end, u_long count)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle) {
+ rle = malloc(sizeof(struct resource_list_entry), M_BUS,
+ M_NOWAIT);
+ if (!rle)
+ panic("resource_list_add: can't record entry");
+ STAILQ_INSERT_TAIL(rl, rle, link);
+ rle->type = type;
+ rle->rid = rid;
+ rle->res = NULL;
+ rle->flags = 0;
+ }
+
+ if (rle->res)
+ panic("resource_list_add: resource entry is busy");
+
+ rle->start = start;
+ rle->end = end;
+ rle->count = count;
+ return (rle);
+}
+
+/**
+ * @brief Determine if a resource entry is busy.
+ *
+ * Returns true if a resource entry is busy, meaning that it has an
+ * associated resource that is not an unallocated "reserved" resource.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns Non-zero if the entry is busy, zero otherwise.
+ */
+int
+resource_list_busy(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (rle == NULL || rle->res == NULL)
+ return (0);
+ if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) {
+ KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE),
+ ("reserved resource is active"));
+ return (0);
+ }
+ return (1);
+}
+
+/**
+ * @brief Determine if a resource entry is reserved.
+ *
+ * Returns true if a resource entry is reserved, meaning that it has an
+ * associated "reserved" resource. The resource can either be
+ * allocated or unallocated.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns Non-zero if the entry is reserved, zero otherwise.
+ */
+int
+resource_list_reserved(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (rle != NULL && rle->flags & RLE_RESERVED)
+ return (1);
+ return (0);
+}
+
+/**
+ * @brief Find a resource entry by type and rid.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns the resource entry pointer or NULL if there is no such
+ * entry.
+ */
+struct resource_list_entry *
+resource_list_find(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type == type && rle->rid == rid)
+ return (rle);
+ }
+ return (NULL);
+}
+
+/**
+ * @brief Delete a resource entry.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ */
+void
+resource_list_delete(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle = resource_list_find(rl, type, rid);
+
+ if (rle) {
+ if (rle->res != NULL)
+ panic("resource_list_delete: resource has not been released");
+ STAILQ_REMOVE(rl, rle, resource_list_entry, link);
+ free(rle, M_BUS);
+ }
+}
+
+/**
+ * @brief Allocate a reserved resource
+ *
+ * This can be used by busses to force the allocation of resources
+ * that are always active in the system even if they are not allocated
+ * by a driver (e.g. PCI BARs). This function is usually called when
+ * adding a new child to the bus. The resource is allocated from the
+ * parent bus when it is reserved. The resource list entry is marked
+ * with RLE_RESERVED to note that it is a reserved resource.
+ *
+ * Subsequent attempts to allocate the resource with
+ * resource_list_alloc() will succeed the first time and will set
+ * RLE_ALLOCATED to note that it has been allocated. When a reserved
+ * resource that has been allocated is released with
+ * resource_list_release(), the RLE_ALLOCATED flag is cleared, but
+ * the actual resource remains allocated. The resource can be released to
+ * the parent bus by calling resource_list_unreserve().
+ *
+ * @param rl the resource list to allocate from
+ * @param bus the parent device of @p child
+ * @param child the device for which the resource is being reserved
+ * @param type the type of resource to allocate
+ * @param rid a pointer to the resource identifier
+ * @param start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param count hint at the size of range required - pass @c 1
+ * for any size
+ * @param flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+struct resource *
+resource_list_reserve(struct resource_list *rl, device_t bus, device_t child,
+ int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ struct resource *r;
+
+ if (passthrough)
+ panic(
+ "resource_list_reserve() should only be called for direct children");
+ if (flags & RF_ACTIVE)
+ panic(
+ "resource_list_reserve() should only reserve inactive resources");
+
+ r = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
+ flags);
+ if (r != NULL) {
+ rle = resource_list_find(rl, type, *rid);
+ rle->flags |= RLE_RESERVED;
+ }
+ return (r);
+}
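+
+/*
+ * Illustrative sketch: a bus that wires down a child's memory window when
+ * the child is added might pair resource_list_add() with
+ * resource_list_reserve() as below. "rl" is the child's resource list
+ * (typically kept in its ivars); the address range is hypothetical.
+ *
+ *    int rid = 0;
+ *
+ *    resource_list_add(rl, SYS_RES_MEMORY, rid, 0xd0000000UL,
+ *        0xd000ffffUL, 0x10000UL);
+ *    resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
+ *        0xd0000000UL, 0xd000ffffUL, 0x10000UL, 0);
+ */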
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE()
+ *
+ * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
+ * and passing the allocation up to the parent of @p bus. This assumes
+ * that the first entry of @c device_get_ivars(child) is a struct
+ * resource_list. This also handles 'passthrough' allocations where a
+ * child is a remote descendant of bus by passing the allocation up to
+ * the parent of bus.
+ *
+ * Typically, a bus driver would store a list of child resources
+ * somewhere in the child device's ivars (see device_get_ivars()) and
+ * its implementation of BUS_ALLOC_RESOURCE() would find that list and
+ * then call resource_list_alloc() to perform the allocation.
+ *
+ * @param rl the resource list to allocate from
+ * @param bus the parent device of @p child
+ * @param child the device which is requesting an allocation
+ * @param type the type of resource to allocate
+ * @param rid a pointer to the resource identifier
+ * @param start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param count hint at the size of range required - pass @c 1
+ * for any size
+ * @param flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+struct resource *
+resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
+ int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ int isdefault = (start == 0UL && end == ~0UL);
+
+ if (passthrough) {
+ return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags));
+ }
+
+ rle = resource_list_find(rl, type, *rid);
+
+ if (!rle)
+ return (NULL); /* no resource of that type/rid */
+
+ if (rle->res) {
+ if (rle->flags & RLE_RESERVED) {
+ if (rle->flags & RLE_ALLOCATED)
+ return (NULL);
+ if ((flags & RF_ACTIVE) &&
+ bus_activate_resource(child, type, *rid,
+ rle->res) != 0)
+ return (NULL);
+ rle->flags |= RLE_ALLOCATED;
+ return (rle->res);
+ }
+ panic("resource_list_alloc: resource entry is busy");
+ }
+
+ if (isdefault) {
+ start = rle->start;
+ count = ulmax(count, rle->count);
+ end = ulmax(rle->end, start + count - 1);
+ }
+
+ rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags);
+
+ /*
+ * Record the new range.
+ */
+ if (rle->res) {
+ rle->start = rman_get_start(rle->res);
+ rle->end = rman_get_end(rle->res);
+ rle->count = count;
+ }
+
+ return (rle->res);
+}
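+
+/*
+ * Illustrative sketch of the usage described above: a bus that keeps a
+ * struct resource_list in each child's ivars can implement
+ * BUS_ALLOC_RESOURCE() by delegating to resource_list_alloc(). The
+ * "foobus_devinfo" structure and method name are hypothetical.
+ *
+ *    static struct resource *
+ *    foobus_alloc_resource(device_t bus, device_t child, int type,
+ *        int *rid, u_long start, u_long end, u_long count, u_int flags)
+ *    {
+ *            struct foobus_devinfo *dinfo = device_get_ivars(child);
+ *
+ *            return (resource_list_alloc(&dinfo->resources, bus, child,
+ *                type, rid, start, end, count, flags));
+ *    }
+ */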
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE()
+ *
+ * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
+ * used with resource_list_alloc().
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device which is requesting a release
+ * @param type the type of resource to release
+ * @param rid the resource identifier
+ * @param res the resource to release
+ *
+ * @retval 0 success
+ * @retval non-zero a standard unix error code indicating what
+ * error condition prevented the operation
+ */
+int
+resource_list_release(struct resource_list *rl, device_t bus, device_t child,
+ int type, int rid, struct resource *res)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ int error;
+
+ if (passthrough) {
+ return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res));
+ }
+
+ rle = resource_list_find(rl, type, rid);
+
+ if (!rle)
+ panic("resource_list_release: can't find resource");
+ if (!rle->res)
+ panic("resource_list_release: resource entry is not busy");
+ if (rle->flags & RLE_RESERVED) {
+ if (rle->flags & RLE_ALLOCATED) {
+ if (rman_get_flags(res) & RF_ACTIVE) {
+ error = bus_deactivate_resource(child, type,
+ rid, res);
+ if (error)
+ return (error);
+ }
+ rle->flags &= ~RLE_ALLOCATED;
+ return (0);
+ }
+ return (EINVAL);
+ }
+
+ error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res);
+ if (error)
+ return (error);
+
+ rle->res = NULL;
+ return (0);
+}
+
+/**
+ * @brief Release all active resources of a given type
+ *
+ * Release all active resources of a specified type. This is intended
+ * to be used to cleanup resources leaked by a driver after detach or
+ * a failed attach.
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device whose active resources are being released
+ * @param type the type of resources to release
+ *
+ * @retval 0 success
+ * @retval EBUSY at least one resource was active
+ */
+int
+resource_list_release_active(struct resource_list *rl, device_t bus,
+ device_t child, int type)
+{
+ struct resource_list_entry *rle;
+ int error, retval;
+
+ retval = 0;
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type != type)
+ continue;
+ if (rle->res == NULL)
+ continue;
+ if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) ==
+ RLE_RESERVED)
+ continue;
+ retval = EBUSY;
+ error = resource_list_release(rl, bus, child, type,
+ rman_get_rid(rle->res), rle->res);
+ if (error != 0)
+ device_printf(bus,
+ "Failed to release active resource: %d\n", error);
+ }
+ return (retval);
+}
+
+/**
+ * @brief Fully release a reserved resource
+ *
+ * Fully releases a resource reserved via resource_list_reserve().
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device whose reserved resource is being released
+ * @param type the type of resource to release
+ * @param rid the resource identifier
+ *
+ * @retval 0 success
+ * @retval non-zero a standard unix error code indicating what
+ * error condition prevented the operation
+ */
+int
+resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child,
+ int type, int rid)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+
+ if (passthrough)
+ panic(
+ "resource_list_unreserve() should only be called for direct children");
+
+ rle = resource_list_find(rl, type, rid);
+
+ if (!rle)
+ panic("resource_list_unreserve: can't find resource");
+ if (!(rle->flags & RLE_RESERVED))
+ return (EINVAL);
+ if (rle->flags & RLE_ALLOCATED)
+ return (EBUSY);
+ rle->flags &= ~RLE_RESERVED;
+ return (resource_list_release(rl, bus, child, type, rid, rle->res));
+}
+
+/**
+ * @brief Print a description of resources in a resource list
+ *
+ * Print all resources of a specified type, for use in BUS_PRINT_CHILD().
+ * The name is printed if at least one resource of the given type is available.
+ * The format is used to print resource start and end.
+ *
+ * @param rl the resource list to print
+ * @param name the name of @p type, e.g. @c "memory"
+ * @param type	the type of resource entry to print
+ * @param format printf(9) format string to print resource
+ * start and end values
+ *
+ * @returns the number of characters printed
+ */
+int
+resource_list_print_type(struct resource_list *rl, const char *name, int type,
+ const char *format)
+{
+ struct resource_list_entry *rle;
+ int printed, retval;
+
+ printed = 0;
+ retval = 0;
+ /* Yes, this is kinda cheating */
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type == type) {
+ if (printed == 0)
+ retval += printf(" %s ", name);
+ else
+ retval += printf(",");
+ printed++;
+ retval += printf(format, rle->start);
+ if (rle->count > 1) {
+ retval += printf("-");
+ retval += printf(format, rle->start +
+ rle->count - 1);
+ }
+ }
+ }
+ return (retval);
+}
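+
+/*
+ * Illustrative sketch: a bus's BUS_PRINT_CHILD() method commonly brackets
+ * resource_list_print_type() calls with the print_child helpers defined
+ * later in this file. The "foobus" names are hypothetical.
+ *
+ *    static int
+ *    foobus_print_child(device_t dev, device_t child)
+ *    {
+ *            struct foobus_devinfo *dinfo = device_get_ivars(child);
+ *            int retval = 0;
+ *
+ *            retval += bus_print_child_header(dev, child);
+ *            retval += resource_list_print_type(&dinfo->resources,
+ *                "mem", SYS_RES_MEMORY, "%#lx");
+ *            retval += resource_list_print_type(&dinfo->resources,
+ *                "irq", SYS_RES_IRQ, "%ld");
+ *            retval += bus_print_child_footer(dev, child);
+ *            return (retval);
+ *    }
+ */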
+
+/**
+ * @brief Releases all the resources in a list.
+ *
+ * @param rl The resource list to purge.
+ *
+ * @returns nothing
+ */
+void
+resource_list_purge(struct resource_list *rl)
+{
+ struct resource_list_entry *rle;
+
+ while ((rle = STAILQ_FIRST(rl)) != NULL) {
+ if (rle->res)
+ bus_release_resource(rman_get_device(rle->res),
+ rle->type, rle->rid, rle->res);
+ STAILQ_REMOVE_HEAD(rl, link);
+ free(rle, M_BUS);
+ }
+}
+
+device_t
+bus_generic_add_child(device_t dev, u_int order, const char *name, int unit)
+{
+
+ return (device_add_child_ordered(dev, order, name, unit));
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_PROBE()
+ *
+ * This function can be used to help implement the DEVICE_PROBE() for
+ * a bus (i.e. a device which has other devices attached to it). It
+ * calls the DEVICE_IDENTIFY() method of each driver in the device's
+ * devclass.
+ */
+int
+bus_generic_probe(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+ driverlink_t dl;
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ /*
+ * If this driver's pass is too high, then ignore it.
+ * For most drivers in the default pass, this will
+ * never be true. For early-pass drivers they will
+ * only call the identify routines of eligible drivers
+ * when this routine is called. Drivers for later
+ * passes should have their identify routines called
+ * on early-pass busses during BUS_NEW_PASS().
+ */
+ if (dl->pass > bus_current_pass)
+ continue;
+ DEVICE_IDENTIFY(dl->driver, dev);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_ATTACH()
+ *
+ * This function can be used to help implement the DEVICE_ATTACH() for
+ * a bus. It calls device_probe_and_attach() for each of the device's
+ * children.
+ */
+int
+bus_generic_attach(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_probe_and_attach(child);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_DETACH()
+ *
+ * This function can be used to help implement the DEVICE_DETACH() for
+ * a bus. It calls device_detach() for each of the device's
+ * children.
+ */
+int
+bus_generic_detach(device_t dev)
+{
+ device_t child;
+ int error;
+
+ if (dev->state != DS_ATTACHED)
+ return (EBUSY);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_SHUTDOWN()
+ *
+ * This function can be used to help implement the DEVICE_SHUTDOWN()
+ * for a bus. It calls device_shutdown() for each of the device's
+ * children.
+ */
+int
+bus_generic_shutdown(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_shutdown(child);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_SUSPEND()
+ *
+ * This function can be used to help implement the DEVICE_SUSPEND()
+ * for a bus. It calls DEVICE_SUSPEND() for each of the device's
+ * children. If any call to DEVICE_SUSPEND() fails, the suspend
+ * operation is aborted and any devices which were suspended are
+ * resumed immediately by calling their DEVICE_RESUME() methods.
+ */
+int
+bus_generic_suspend(device_t dev)
+{
+ int error;
+ device_t child, child2;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ error = DEVICE_SUSPEND(child);
+ if (error) {
+ for (child2 = TAILQ_FIRST(&dev->children);
+ child2 && child2 != child;
+ child2 = TAILQ_NEXT(child2, link))
+ DEVICE_RESUME(child2);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_RESUME()
+ *
+ * This function can be used to help implement the DEVICE_RESUME() for
+ * a bus. It calls DEVICE_RESUME() on each of the device's children.
+ */
+int
+bus_generic_resume(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ DEVICE_RESUME(child);
+ /* if resume fails, there's nothing we can usefully do... */
+ }
+ return (0);
+}
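+
+/*
+ * Illustrative sketch: a simple bridge driver that needs nothing beyond
+ * probing and attaching its children can often point its device interface
+ * methods directly at the generic helpers above. The "foobus" names are
+ * hypothetical.
+ *
+ *    static device_method_t foobus_methods[] = {
+ *            DEVMETHOD(device_probe,        foobus_probe),
+ *            DEVMETHOD(device_attach,       bus_generic_attach),
+ *            DEVMETHOD(device_detach,       bus_generic_detach),
+ *            DEVMETHOD(device_shutdown,     bus_generic_shutdown),
+ *            DEVMETHOD(device_suspend,      bus_generic_suspend),
+ *            DEVMETHOD(device_resume,       bus_generic_resume),
+ *            DEVMETHOD_END
+ *    };
+ */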
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function prints the first part of the ascii representation of
+ * @p child, including its name, unit and description (if any - see
+ * device_set_desc()).
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_print_child_header(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_get_desc(child)) {
+ retval += device_printf(child, "<%s>", device_get_desc(child));
+ } else {
+ retval += printf("%s", device_get_nameunit(child));
+ }
+
+ return (retval);
+}
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function prints the last part of the ascii representation of
+ * @p child, which consists of the string @c " on " followed by the
+ * name and unit of the @p dev.
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_print_child_footer(device_t dev, device_t child)
+{
+ return (printf(" on %s\n", device_get_nameunit(dev)));
+}
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function simply calls bus_print_child_header() followed by
+ * bus_print_child_footer().
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_generic_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += bus_print_child_footer(dev, child);
+
+ return (retval);
+}
+
+/**
+ * @brief Stub function for implementing BUS_READ_IVAR().
+ *
+ * @returns ENOENT
+ */
+int
+bus_generic_read_ivar(device_t dev, device_t child, int index,
+ uintptr_t * result)
+{
+ return (ENOENT);
+}
+
+/**
+ * @brief Stub function for implementing BUS_WRITE_IVAR().
+ *
+ * @returns ENOENT
+ */
+int
+bus_generic_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value)
+{
+ return (ENOENT);
+}
+
+/**
+ * @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
+ *
+ * @returns NULL
+ */
+struct resource_list *
+bus_generic_get_resource_list(device_t dev, device_t child)
+{
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DRIVER_ADDED().
+ *
+ * This implementation of BUS_DRIVER_ADDED() simply calls the driver's
+ * DEVICE_IDENTIFY() method to allow it to add new children to the bus
+ * and then calls device_probe_and_attach() for each unattached child.
+ */
+void
+bus_generic_driver_added(device_t dev, driver_t *driver)
+{
+ device_t child;
+
+ DEVICE_IDENTIFY(driver, dev);
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if (child->state == DS_NOTPRESENT ||
+ (child->flags & DF_REBID))
+ device_probe_and_attach(child);
+ }
+}
+
+/**
+ * @brief Helper function for implementing BUS_NEW_PASS().
+ *
+ * This implementation of BUS_NEW_PASS() first calls the identify
+ * routines for any drivers that probe at the current pass. Then it
+ * walks the list of devices for this bus. If a device is already
+ * attached, then it calls BUS_NEW_PASS() on that device. If the
+ * device is not already attached, it attempts to attach a driver to
+ * it.
+ */
+void
+bus_generic_new_pass(device_t dev)
+{
+ driverlink_t dl;
+ devclass_t dc;
+ device_t child;
+
+ dc = dev->devclass;
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ if (dl->pass == bus_current_pass)
+ DEVICE_IDENTIFY(dl->driver, dev);
+ }
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if (child->state >= DS_ATTACHED)
+ BUS_NEW_PASS(child);
+ else if (child->state == DS_NOTPRESENT)
+ device_probe_and_attach(child);
+ }
+}
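+
+/*
+ * Illustrative sketch: a driver that must attach before BUS_PASS_DEFAULT
+ * registers with an explicit pass so that the pass handling above picks
+ * it up at the right time. The "foo" names are hypothetical.
+ *
+ *    EARLY_DRIVER_MODULE(foo, foobus, foo_driver, foo_devclass,
+ *        NULL, NULL, BUS_PASS_BUS);
+ */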
+
+/**
+ * @brief Helper function for implementing BUS_SETUP_INTR().
+ *
+ * This simple implementation of BUS_SETUP_INTR() simply calls the
+ * BUS_SETUP_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
+ int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg,
+ void **cookiep)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
+ filter, intr, arg, cookiep));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_TEARDOWN_INTR().
+ *
+ * This simple implementation of BUS_TEARDOWN_INTR() simply calls the
+ * BUS_TEARDOWN_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ADJUST_RESOURCE().
+ *
+ * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the
+ * BUS_ADJUST_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_adjust_resource(device_t dev, device_t child, int type,
+ struct resource *r, u_long start, u_long end)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start,
+ end));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
+ *
+ * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
+ * BUS_ALLOC_RESOURCE() method of the parent of @p dev.
+ */
+struct resource *
+bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ u_long start, u_long end, u_long count, u_int flags)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
+ start, end, count, flags));
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
+ *
+ * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
+ * BUS_RELEASE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
+ *
+ * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
+ * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
+ *
+ * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
+ * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_deactivate_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_BIND_INTR().
+ *
+ * This simple implementation of BUS_BIND_INTR() simply calls the
+ * BUS_BIND_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq,
+ int cpu)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_BIND_INTR(dev->parent, child, irq, cpu));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_CONFIG_INTR().
+ *
+ * This simple implementation of BUS_CONFIG_INTR() simply calls the
+ * BUS_CONFIG_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
+ enum intr_polarity pol)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DESCRIBE_INTR().
+ *
+ * This simple implementation of BUS_DESCRIBE_INTR() simply calls the
+ * BUS_DESCRIBE_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie, const char *descr)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie,
+ descr));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_GET_DMA_TAG().
+ *
+ * This simple implementation of BUS_GET_DMA_TAG() simply calls the
+ * BUS_GET_DMA_TAG() method of the parent of @p dev.
+ */
+bus_dma_tag_t
+bus_generic_get_dma_tag(device_t dev, device_t child)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent != NULL)
+ return (BUS_GET_DMA_TAG(dev->parent, child));
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_GET_RESOURCE().
+ *
+ * This implementation of BUS_GET_RESOURCE() uses the
+ * resource_list_find() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * search.
+ */
+int
+bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ struct resource_list * rl = NULL;
+ struct resource_list_entry * rle = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle)
+ return (ENOENT);
+
+ if (startp)
+ *startp = rle->start;
+ if (countp)
+ *countp = rle->count;
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing BUS_SET_RESOURCE().
+ *
+ * This implementation of BUS_SET_RESOURCE() uses the
+ * resource_list_add() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * edit.
+ */
+int
+bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
+ u_long start, u_long count)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ resource_list_add(rl, type, rid, start, (start + count - 1), count);
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DELETE_RESOURCE().
+ *
+ * This implementation of BUS_DELETE_RESOURCE() uses the
+ * resource_list_delete() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * edit.
+ */
+void
+bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return;
+
+ resource_list_delete(rl, type, rid);
+}
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
+ *
+ * This implementation of BUS_RELEASE_RESOURCE() uses the
+ * resource_list_release() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
+ */
+int
+bus_generic_rl_release_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ struct resource_list * rl = NULL;
+
+ if (device_get_parent(child) != dev)
+ return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
+ type, rid, r));
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ return (resource_list_release(rl, dev, child, type, rid, r));
+}
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
+ *
+ * This implementation of BUS_ALLOC_RESOURCE() uses the
+ * resource_list_alloc() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
+ */
+struct resource *
+bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
+ int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list * rl = NULL;
+
+ if (device_get_parent(child) != dev)
+ return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
+ type, rid, start, end, count, flags));
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (NULL);
+
+ return (resource_list_alloc(rl, dev, child, type, rid,
+ start, end, count, flags));
+}
+
+/**
+ * @brief Helper function for implementing BUS_CHILD_PRESENT().
+ *
+ * This simple implementation of BUS_CHILD_PRESENT() simply calls the
+ * BUS_CHILD_PRESENT() method of the parent of @p dev.
+ */
+int
+bus_generic_child_present(device_t dev, device_t child)
+{
+ return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
+}
+
+/*
+ * Some convenience functions to make it easier for drivers to use the
+ * resource-management functions. All these really do is hide the
+ * indirection through the parent's method table, making for slightly
+ * less-wordy code. In the future, it might make sense for this code
+ * to maintain some sort of a list of resources allocated by each device.
+ */
+
+int
+bus_alloc_resources(device_t dev, struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ res[i] = NULL;
+ for (i = 0; rs[i].type != -1; i++) {
+ res[i] = bus_alloc_resource_any(dev,
+ rs[i].type, &rs[i].rid, rs[i].flags);
+ if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
+ bus_release_resources(dev, rs, res);
+ return (ENXIO);
+ }
+ }
+ return (0);
+}
+
+void
+bus_release_resources(device_t dev, const struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ if (res[i] != NULL) {
+ bus_release_resource(
+ dev, rs[i].type, rs[i].rid, res[i]);
+ res[i] = NULL;
+ }
+}
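+
+/*
+ * Illustrative sketch: the usual consumer of the two helpers above is a
+ * driver that describes its resources in a resource_spec array and
+ * allocates them in one shot during attach. The "foo" names and the
+ * softc "res" array are hypothetical.
+ *
+ *    static struct resource_spec foo_res_spec[] = {
+ *            { SYS_RES_MEMORY, 0, RF_ACTIVE },
+ *            { SYS_RES_IRQ,    0, RF_ACTIVE | RF_SHAREABLE },
+ *            { -1, 0, 0 }
+ *    };
+ *
+ *    In foo_attach():
+ *            if (bus_alloc_resources(dev, foo_res_spec, sc->res) != 0)
+ *                    return (ENXIO);
+ *    In foo_detach():
+ *            bus_release_resources(dev, foo_res_spec, sc->res);
+ */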
+
+/**
+ * @brief Wrapper function for BUS_ALLOC_RESOURCE().
+ *
+ * This function simply calls the BUS_ALLOC_RESOURCE() method of the
+ * parent of @p dev.
+ */
+struct resource *
+bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+{
+ if (dev->parent == NULL)
+ return (NULL);
+ return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
+ count, flags));
+}
+
+/**
+ * @brief Wrapper function for BUS_ADJUST_RESOURCE().
+ *
+ * This function simply calls the BUS_ADJUST_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start,
+ u_long end)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end));
+}
+
+/**
+ * @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
+ *
+ * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
+ *
+ * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_RELEASE_RESOURCE().
+ *
+ * This function simply calls the BUS_RELEASE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_release_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_SETUP_INTR().
+ *
+ * This function simply calls the BUS_SETUP_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_setup_intr(device_t dev, struct resource *r, int flags,
+ driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
+{
+ int error;
+
+ if (dev->parent == NULL)
+ return (EINVAL);
+ error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler,
+ arg, cookiep);
+ if (error != 0)
+ return (error);
+ if (handler != NULL && !(flags & INTR_MPSAFE))
+ device_printf(dev, "[GIANT-LOCKED]\n");
+ return (0);
+}
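+
+/*
+ * Illustrative sketch: a driver typically allocates its IRQ resource and
+ * then hands an MPSAFE handler to bus_setup_intr(); the matching teardown
+ * goes through bus_teardown_intr() below. The "foo" names and softc
+ * fields are hypothetical.
+ *
+ *    sc->irq_rid = 0;
+ *    sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
+ *        &sc->irq_rid, RF_ACTIVE | RF_SHAREABLE);
+ *    if (sc->irq_res == NULL)
+ *            return (ENXIO);
+ *    error = bus_setup_intr(dev, sc->irq_res,
+ *        INTR_TYPE_MISC | INTR_MPSAFE, NULL, foo_intr, sc,
+ *        &sc->intr_cookie);
+ */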
+
+/**
+ * @brief Wrapper function for BUS_TEARDOWN_INTR().
+ *
+ * This function simply calls the BUS_TEARDOWN_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
+}
+
+/**
+ * @brief Wrapper function for BUS_BIND_INTR().
+ *
+ * This function simply calls the BUS_BIND_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_bind_intr(device_t dev, struct resource *r, int cpu)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_BIND_INTR(dev->parent, dev, r, cpu));
+}
+
+/**
+ * @brief Wrapper function for BUS_DESCRIBE_INTR().
+ *
+ * This function first formats the requested description into a
+ * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of
+ * the parent of @p dev.
+ */
+int
+bus_describe_intr(device_t dev, struct resource *irq, void *cookie,
+ const char *fmt, ...)
+{
+ va_list ap;
+ char descr[MAXCOMLEN + 1];
+
+ if (dev->parent == NULL)
+ return (EINVAL);
+ va_start(ap, fmt);
+ vsnprintf(descr, sizeof(descr), fmt, ap);
+ va_end(ap);
+ return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr));
+}
+
+/**
+ * @brief Wrapper function for BUS_SET_RESOURCE().
+ *
+ * This function simply calls the BUS_SET_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_set_resource(device_t dev, int type, int rid,
+ u_long start, u_long count)
+{
+ return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ start, count));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_get_resource(device_t dev, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ startp, countp));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev and returns the start value.
+ */
+u_long
+bus_get_resource_start(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (start);
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev and returns the count value.
+ */
+u_long
+bus_get_resource_count(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (count);
+}
+
+/**
+ * @brief Wrapper function for BUS_DELETE_RESOURCE().
+ *
+ * This function simply calls the BUS_DELETE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+void
+bus_delete_resource(device_t dev, int type, int rid)
+{
+ BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_PRESENT().
+ *
+ * This function simply calls the BUS_CHILD_PRESENT() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_present(device_t child)
+{
+ return (BUS_CHILD_PRESENT(device_get_parent(child), child));
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
+ *
+ * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
+{
+ device_t parent;
+
+ parent = device_get_parent(child);
+ if (parent == NULL) {
+ *buf = '\0';
+ return (0);
+ }
+ return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen));
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_LOCATION_STR().
+ *
+ * This function simply calls the BUS_CHILD_LOCATION_STR() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_location_str(device_t child, char *buf, size_t buflen)
+{
+ device_t parent;
+
+ parent = device_get_parent(child);
+ if (parent == NULL) {
+ *buf = '\0';
+ return (0);
+ }
+ return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_DMA_TAG().
+ *
+ * This function simply calls the BUS_GET_DMA_TAG() method of the
+ * parent of @p dev.
+ */
+bus_dma_tag_t
+bus_get_dma_tag(device_t dev)
+{
+ device_t parent;
+
+ parent = device_get_parent(dev);
+ if (parent == NULL)
+ return (NULL);
+ return (BUS_GET_DMA_TAG(parent, dev));
+}
+
+/* Resume all devices and then notify userland that we're up again. */
+static int
+root_resume(device_t dev)
+{
+ int error;
+
+ error = bus_generic_resume(dev);
+ if (error == 0)
+ devctl_notify("kern", "power", "resume", NULL);
+ return (error);
+}
+
+static int
+root_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += printf("\n");
+
+ return (retval);
+}
+
+static int
+root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
+ driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
+{
+ /*
+ * If an interrupt mapping gets to here something bad has happened.
+ */
+ panic("root_setup_intr");
+}
+
+/*
+ * If we get here, assume that the device is permanent and really is
+ * present in the system. Removable bus drivers are expected to intercept
+ * this call long before it gets here. We return -1 so that drivers that
+ * really care can check against -1 or some errno returned higher in the
+ * food chain.
+ */
+static int
+root_child_present(device_t dev, device_t child)
+{
+ return (-1);
+}
+
+static kobj_method_t root_methods[] = {
+ /* Device interface */
+ KOBJMETHOD(device_shutdown, bus_generic_shutdown),
+ KOBJMETHOD(device_suspend, bus_generic_suspend),
+ KOBJMETHOD(device_resume, root_resume),
+
+ /* Bus interface */
+ KOBJMETHOD(bus_print_child, root_print_child),
+ KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar),
+ KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
+ KOBJMETHOD(bus_setup_intr, root_setup_intr),
+ KOBJMETHOD(bus_child_present, root_child_present),
+
+ KOBJMETHOD_END
+};
+
+static driver_t root_driver = {
+ "root",
+ root_methods,
+ 1, /* no softc */
+};
+
+device_t root_bus;
+devclass_t root_devclass;
+
+static int
+root_bus_module_handler(module_t mod, int what, void* arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ TAILQ_INIT(&bus_data_devices);
+ kobj_class_compile((kobj_class_t) &root_driver);
+ root_bus = make_device(NULL, "root", 0);
+ root_bus->desc = "System root bus";
+ kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
+ root_bus->driver = &root_driver;
+ root_bus->state = DS_ATTACHED;
+ root_devclass = devclass_find_internal("root", NULL, FALSE);
+ devinit();
+ return (0);
+
+ case MOD_SHUTDOWN:
+ device_shutdown(root_bus);
+ return (0);
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ return (0);
+}
+
+static moduledata_t root_bus_mod = {
+ "rootbus",
+ root_bus_module_handler,
+ NULL
+};
+DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+
+/**
+ * @brief Automatically configure devices
+ *
+ * This function begins the autoconfiguration process by calling
+ * device_probe_and_attach() for each child of the @c root0 device.
+ */
+void
+root_bus_configure(void)
+{
+
+ PDEBUG(("."));
+
+ /* Eventually this will be split up, but this is sufficient for now. */
+ bus_set_pass(BUS_PASS_DEFAULT);
+}
+
+/**
+ * @brief Module handler for registering device drivers
+ *
+ * This module handler is used to automatically register device
+ * drivers when modules are loaded. If @p what is MOD_LOAD, it calls
+ * devclass_add_driver() for the driver described by the
+ * driver_module_data structure pointed to by @p arg
+ */
+int
+driver_module_handler(module_t mod, int what, void *arg)
+{
+ struct driver_module_data *dmd;
+ devclass_t bus_devclass;
+ kobj_class_t driver;
+ int error, pass;
+
+ dmd = (struct driver_module_data *)arg;
+ bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
+ error = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ if (dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+
+ pass = dmd->dmd_pass;
+ driver = dmd->dmd_driver;
+ PDEBUG(("Loading module: driver %s on bus %s (pass %d)",
+ DRIVERNAME(driver), dmd->dmd_busname, pass));
+ error = devclass_add_driver(bus_devclass, driver, pass,
+ dmd->dmd_devclass);
+ break;
+
+ case MOD_UNLOAD:
+ PDEBUG(("Unloading module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_driver),
+ dmd->dmd_busname));
+ error = devclass_delete_driver(bus_devclass,
+ dmd->dmd_driver);
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+ break;
+ case MOD_QUIESCE:
+ PDEBUG(("Quiesce module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_driver),
+ dmd->dmd_busname));
+ error = devclass_quiesce_driver(bus_devclass,
+ dmd->dmd_driver);
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
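+
+/*
+ * Illustrative sketch: drivers rarely call driver_module_handler()
+ * directly; the DRIVER_MODULE() macro emits the driver_module_data and
+ * moduledata_t that route through it. The "foo" names are hypothetical.
+ *
+ *    static driver_t foo_driver = {
+ *            "foo",
+ *            foo_methods,
+ *            sizeof(struct foo_softc)
+ *    };
+ *    static devclass_t foo_devclass;
+ *
+ *    DRIVER_MODULE(foo, foobus, foo_driver, foo_devclass, NULL, NULL);
+ */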
+
+/**
+ * @brief Enumerate all hinted devices for this bus.
+ *
+ * Walks through the hints for this bus and calls the bus_hinted_child
+ * routine for each one it finds. It searches first for the specific
+ * bus that is being probed for hinted children (e.g. isa0), and then for
+ * generic children (e.g. isa).
+ *
+ * @param	bus	the bus device to enumerate
+ */
+void
+bus_enumerate_hinted_children(device_t bus)
+{
+ int i;
+ const char *dname, *busname;
+ int dunit;
+
+ /*
+ * enumerate all devices on the specific bus
+ */
+ busname = device_get_nameunit(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+
+ /*
+ * and all the generic ones.
+ */
+ busname = device_get_name(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+}
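+
+/*
+ * Illustrative sketch: a BUS_HINTED_CHILD() implementation usually adds
+ * the named child and pulls its wiring from the same hints, e.g. via
+ * resource_long_value() and resource_int_value(). The "foobus" name and
+ * the hint keys are hypothetical; error handling is omitted.
+ *
+ *    static void
+ *    foobus_hinted_child(device_t bus, const char *dname, int dunit)
+ *    {
+ *            device_t child;
+ *            long maddr, msize;
+ *            int irq;
+ *
+ *            child = BUS_ADD_CHILD(bus, 0, dname, dunit);
+ *            if (resource_long_value(dname, dunit, "maddr", &maddr) == 0 &&
+ *                resource_long_value(dname, dunit, "msize", &msize) == 0)
+ *                    bus_set_resource(child, SYS_RES_MEMORY, 0, maddr,
+ *                        msize);
+ *            if (resource_int_value(dname, dunit, "irq", &irq) == 0)
+ *                    bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1);
+ *    }
+ */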
+
+#ifdef BUS_DEBUG
+
+/* the _short versions avoid iteration by not calling anything that prints
+ * more than oneliners. I love oneliners.
+ */
+
+static void
+print_device_short(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
+ dev->unit, dev->desc,
+ (dev->parent? "":"no "),
+ (TAILQ_EMPTY(&dev->children)? "no ":""),
+ (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
+ (dev->flags&DF_FIXEDCLASS? "fixed,":""),
+ (dev->flags&DF_WILDCARD? "wildcard,":""),
+ (dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
+ (dev->flags&DF_REBID? "rebiddable,":""),
+ (dev->ivars? "":"no "),
+ (dev->softc? "":"no "),
+ dev->busy));
+}
+
+static void
+print_device(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ indentprintf(("Parent:\n"));
+ print_device_short(dev->parent, indent+1);
+ indentprintf(("Driver:\n"));
+ print_driver_short(dev->driver, indent+1);
+ indentprintf(("Devclass:\n"));
+ print_devclass_short(dev->devclass, indent+1);
+}
+
+void
+print_device_tree_short(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree_short(child, indent+1);
+ }
+}
+
+void
+print_device_tree(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree(child, indent+1);
+ }
+}
+
+static void
+print_driver_short(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ indentprintf(("driver %s: softc size = %zd\n",
+ driver->name, driver->size));
+}
+
+static void
+print_driver(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ print_driver_short(driver, indent);
+}
+
+static void
+print_driver_list(driver_list_t drivers, int indent)
+{
+ driverlink_t driver;
+
+ TAILQ_FOREACH(driver, &drivers, link) {
+ print_driver(driver->driver, indent);
+ }
+}
+
+static void
+print_devclass_short(devclass_t dc, int indent)
+{
+	if (!dc)
+ return;
+
+ indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
+}
+
+static void
+print_devclass(devclass_t dc, int indent)
+{
+ int i;
+
+	if (!dc)
+ return;
+
+ print_devclass_short(dc, indent);
+ indentprintf(("Drivers:\n"));
+ print_driver_list(dc->drivers, indent+1);
+
+ indentprintf(("Devices:\n"));
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ print_device(dc->devices[i], indent+1);
+}
+
+void
+print_devclass_list_short(void)
+{
+ devclass_t dc;
+
+ printf("Short listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass_short(dc, 0);
+ }
+}
+
+void
+print_devclass_list(void)
+{
+ devclass_t dc;
+
+ printf("Full listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass(dc, 0);
+ }
+}
+
+#endif
+
+/*
+ * User-space access to the device tree.
+ *
+ * We implement a small set of nodes:
+ *
+ * hw.bus Single integer read method to obtain the
+ * current generation count.
+ * hw.bus.devices Reads the entire device tree in flat space.
+ * hw.bus.rman Resource manager interface
+ *
+ * We might like to add the ability to scan devclasses and/or drivers to
+ * determine what else is currently loaded/available.
+ */
+
+static int
+sysctl_bus(SYSCTL_HANDLER_ARGS)
+{
+ struct u_businfo ubus;
+
+ ubus.ub_version = BUS_USER_VERSION;
+ ubus.ub_generation = bus_data_generation;
+
+ return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
+}
+SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
+ "bus-related data");
+
+static int
+sysctl_devices(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int index;
+ struct device *dev;
+ struct u_device udev; /* XXX this is a bit big */
+ int error;
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+
+ index = name[1];
+
+ /*
+ * Scan the list of devices, looking for the requested index.
+ */
+ TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
+ if (index-- == 0)
+ break;
+ }
+ if (dev == NULL)
+ return (ENOENT);
+
+ /*
+ * Populate the return array.
+ */
+ bzero(&udev, sizeof(udev));
+ udev.dv_handle = (uintptr_t)dev;
+ udev.dv_parent = (uintptr_t)dev->parent;
+ if (dev->nameunit != NULL)
+ strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
+ if (dev->desc != NULL)
+ strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
+ if (dev->driver != NULL && dev->driver->name != NULL)
+ strlcpy(udev.dv_drivername, dev->driver->name,
+ sizeof(udev.dv_drivername));
+ bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
+ bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
+ udev.dv_devflags = dev->devflags;
+ udev.dv_flags = dev->flags;
+ udev.dv_state = dev->state;
+ error = SYSCTL_OUT(req, &udev, sizeof(udev));
+ return (error);
+}
+
+SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
+ "system device tree");
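
For context on how this pair of sysctls is meant to be consumed, here is a minimal userland sketch in the spirit of devinfo(8)/libdevinfo. It relies only on what the handlers above implement: hw.bus.info returns a struct u_businfo, and hw.bus.devices takes {generation, index} as the two extra name components, failing with EINVAL on a stale generation and ENOENT past the last device. The walk_devices() wrapper itself is illustrative, not part of this change.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/bus.h>

#include <errno.h>
#include <stdio.h>

static int
walk_devices(void)
{
	struct u_businfo ubus;
	struct u_device udev;
	int mib[CTL_MAXNAME + 2];
	size_t len, miblen;
	int idx;

	len = sizeof(ubus);
	if (sysctlbyname("hw.bus.info", &ubus, &len, NULL, 0) != 0)
		return (-1);

	miblen = CTL_MAXNAME;
	if (sysctlnametomib("hw.bus.devices", mib, &miblen) != 0)
		return (-1);
	mib[miblen] = ubus.ub_generation;	/* name[0]: generation check */

	for (idx = 0; ; idx++) {
		mib[miblen + 1] = idx;		/* name[1]: device index */
		len = sizeof(udev);
		if (sysctl(mib, (u_int)miblen + 2, &udev, &len, NULL, 0) != 0) {
			if (errno == ENOENT)
				break;		/* walked past the last device */
			return (-1);		/* EINVAL: tree changed, retry */
		}
		printf("%s: %s\n", udev.dv_name, udev.dv_desc);
	}
	return (0);
}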
+
+int
+bus_data_generation_check(int generation)
+{
+ if (generation != bus_data_generation)
+ return (1);
+
+ /* XXX generate optimised lists here? */
+ return (0);
+}
+
+void
+bus_data_generation_update(void)
+{
+ bus_data_generation++;
+}
+
+int
+bus_free_resource(device_t dev, int type, struct resource *r)
+{
+ if (r == NULL)
+ return (0);
+ return (bus_release_resource(dev, type, rman_get_rid(r), r));
+}
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
new file mode 100644
index 0000000..999de3f
--- /dev/null
+++ b/sys/kern/subr_bus_dma.c
@@ -0,0 +1,533 @@
+/*-
+ * Copyright (c) 2012 EMC Corp.
+ * All rights reserved.
+ *
+ * Copyright (c) 1997, 1998 Justin T. Gibbs.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+
+#include <machine/bus.h>
+
+/*
+ * Load a list of virtual addresses.
+ */
+static int
+_bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map,
+ bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs,
+ int flags)
+{
+ int error;
+
+ error = 0;
+ for (; sglist_cnt > 0; sglist_cnt--, list++) {
+ error = _bus_dmamap_load_buffer(dmat, map,
+ (void *)(uintptr_t)list->ds_addr, list->ds_len, pmap,
+ flags, NULL, nsegs);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Load a list of physical addresses.
+ */
+static int
+_bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
+ bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags)
+{
+ int error;
+
+ error = 0;
+ for (; sglist_cnt > 0; sglist_cnt--, list++) {
+ error = _bus_dmamap_load_phys(dmat, map,
+ (vm_paddr_t)list->ds_addr, list->ds_len, flags, NULL,
+ nsegs);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Load an mbuf chain.
+ */
+static int
+_bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ struct mbuf *m;
+ int error;
+
+ error = 0;
+ for (m = m0; m != NULL && error == 0; m = m->m_next) {
+ if (m->m_len > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
+ m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
+ segs, nsegs);
+ }
+ }
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, *nsegs);
+ return (error);
+}
+
+/*
+ * Load from block io.
+ */
+static int
+_bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+ int *nsegs, int flags)
+{
+ vm_paddr_t paddr;
+ bus_size_t len, tlen;
+ int error, i, ma_offs;
+
+ if ((bio->bio_flags & BIO_UNMAPPED) == 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+ bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ return (error);
+ }
+
+ error = 0;
+ tlen = bio->bio_bcount;
+ ma_offs = bio->bio_ma_offset;
+ for (i = 0; tlen > 0; i++, tlen -= len) {
+ len = min(PAGE_SIZE - ma_offs, tlen);
+ paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs;
+ error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+ flags, NULL, nsegs);
+ if (error != 0)
+ break;
+ ma_offs = 0;
+ }
+ return (error);
+}
+
+/*
+ * Load a cam control block.
+ */
+static int
+_bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+ int *nsegs, int flags)
+{
+ struct ccb_hdr *ccb_h;
+ void *data_ptr;
+ int error;
+ uint32_t dxfer_len;
+ uint16_t sglist_cnt;
+
+ error = 0;
+ ccb_h = &ccb->ccb_h;
+ switch (ccb_h->func_code) {
+ case XPT_SCSI_IO: {
+ struct ccb_scsiio *csio;
+
+ csio = &ccb->csio;
+ data_ptr = csio->data_ptr;
+ dxfer_len = csio->dxfer_len;
+ sglist_cnt = csio->sglist_cnt;
+ break;
+ }
+ case XPT_CONT_TARGET_IO: {
+ struct ccb_scsiio *ctio;
+
+ ctio = &ccb->ctio;
+ data_ptr = ctio->data_ptr;
+ dxfer_len = ctio->dxfer_len;
+ sglist_cnt = ctio->sglist_cnt;
+ break;
+ }
+ case XPT_ATA_IO: {
+ struct ccb_ataio *ataio;
+
+ ataio = &ccb->ataio;
+ data_ptr = ataio->data_ptr;
+ dxfer_len = ataio->dxfer_len;
+ sglist_cnt = 0;
+ break;
+ }
+ default:
+ panic("_bus_dmamap_load_ccb: Unsupported func code %d",
+ ccb_h->func_code);
+ }
+
+ switch ((ccb_h->flags & CAM_DATA_MASK)) {
+ case CAM_DATA_VADDR:
+ error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len,
+ kernel_pmap, flags, NULL, nsegs);
+ break;
+ case CAM_DATA_PADDR:
+ error = _bus_dmamap_load_phys(dmat, map,
+ (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL,
+ nsegs);
+ break;
+ case CAM_DATA_SG:
+ error = _bus_dmamap_load_vlist(dmat, map,
+ (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap,
+ nsegs, flags);
+ break;
+ case CAM_DATA_SG_PADDR:
+ error = _bus_dmamap_load_plist(dmat, map,
+ (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags);
+ break;
+ case CAM_DATA_BIO:
+ error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr,
+ nsegs, flags);
+ break;
+ default:
+ panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented",
+ ccb_h->flags);
+ }
+ return (error);
+}
+
+/*
+ * Load a uio.
+ */
+static int
+_bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+ int *nsegs, int flags)
+{
+ bus_size_t resid;
+ bus_size_t minlen;
+ struct iovec *iov;
+ pmap_t pmap;
+ caddr_t addr;
+ int error, i;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("bus_dmamap_load_uio: USERSPACE but no proc"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = kernel_pmap;
+ resid = uio->uio_resid;
+ iov = uio->uio_iov;
+ error = 0;
+
+ for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) {
+ /*
+ * Now at the first iovec to load. Load each iovec
+ * until we have exhausted the residual count.
+ */
+
+ addr = (caddr_t) iov[i].iov_base;
+ minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len;
+ if (minlen > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, addr,
+ minlen, pmap, flags, NULL, nsegs);
+ resid -= minlen;
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Map the buffer buf into bus space using the dmamap map.
+ */
+int
+bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+ bus_size_t buflen, bus_dmamap_callback_t *callback,
+ void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_vaddr(buf, buflen);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+
+ nsegs = -1;
+ error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap,
+ flags, NULL, &nsegs);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, 0);
+
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
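
As a usage sketch only (the foo_* tag, map, and softc fields are invented for illustration): a driver with a wired-down descriptor ring typically loads it once with BUS_DMA_NOWAIT, so the callback runs synchronously and ENOMEM is the only failure it has to handle, as the comments above describe.

#include <sys/param.h>
#include <machine/bus.h>

#define	FOO_RING_SIZE	4096		/* hypothetical ring size */

struct foo_softc {			/* hypothetical driver state */
	bus_dma_tag_t	foo_dtag;
	bus_dmamap_t	foo_dmap;
	void		*foo_ring;
	bus_addr_t	foo_ring_paddr;
};

/* Matches bus_dmamap_callback_t: remember the single segment's bus address. */
static void
foo_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	bus_addr_t *addrp = arg;

	if (error != 0 || nseg != 1)
		return;
	*addrp = segs[0].ds_addr;
}

static int
foo_load_ring(struct foo_softc *sc)
{

	/* With BUS_DMA_NOWAIT the callback has run by the time this returns. */
	return (bus_dmamap_load(sc->foo_dtag, sc->foo_dmap, sc->foo_ring,
	    FOO_RING_SIZE, foo_dmamap_cb, &sc->foo_ring_paddr, BUS_DMA_NOWAIT));
}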
+
+int
+bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+ bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int nsegs, error;
+
+ M_ASSERTPKTHDR(m0);
+
+ flags |= BUS_DMA_NOWAIT;
+ nsegs = -1;
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags);
+ ++nsegs;
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len, error);
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+ return (error);
+}
+
+int
+bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+ bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ int error;
+
+ flags |= BUS_DMA_NOWAIT;
+ *nsegs = -1;
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags);
+ ++*nsegs;
+ _bus_dmamap_complete(dmat, map, segs, *nsegs, error);
+ return (error);
+}
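
A sketch of the transmit-path pattern this wrapper is built for (FOO_MAXTXSEGS and foo_encap() are assumptions): the caller supplies its own segment array, and an EFBIG return, which the MD backends use when the chain exceeds the tag's segment limit, is handled by coalescing the mbufs and retrying once.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <machine/bus.h>

#define	FOO_MAXTXSEGS	32		/* hypothetical per-packet segment limit */

static int
foo_encap(bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m_head,
    bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m;
	int error;

	error = bus_dmamap_load_mbuf_sg(tag, map, *m_head, segs, nsegs,
	    BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		/* Too many segments for the tag; coalesce the chain, retry. */
		m = m_collapse(*m_head, M_NOWAIT, FOO_MAXTXSEGS);
		if (m == NULL) {
			m_freem(*m_head);
			*m_head = NULL;
			return (ENOBUFS);
		}
		*m_head = m;
		error = bus_dmamap_load_mbuf_sg(tag, map, *m_head, segs,
		    nsegs, BUS_DMA_NOWAIT);
	}
	return (error);
}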
+
+int
+bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+ bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int nsegs, error;
+
+ flags |= BUS_DMA_NOWAIT;
+ nsegs = -1;
+ error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags);
+ nsegs++;
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, uio->uio_resid, error);
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+ return (error);
+}
+
+int
+bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+ bus_dmamap_callback_t *callback, void *callback_arg,
+ int flags)
+{
+ bus_dma_segment_t *segs;
+ struct ccb_hdr *ccb_h;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ ccb_h = &ccb->ccb_h;
+ if ((ccb_h->flags & CAM_DIR_MASK) == CAM_DIR_NONE) {
+ callback(callback_arg, NULL, 0, 0);
+ return (0);
+ }
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_ccb(ccb);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+ nsegs = -1;
+ error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, error);
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
+
+int
+bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+ bus_dmamap_callback_t *callback, void *callback_arg,
+ int flags)
+{
+ bus_dma_segment_t *segs;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_bio(bio);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+ nsegs = -1;
+ error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, error);
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
+
+int
+bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct memdesc *mem, bus_dmamap_callback_t *callback,
+ void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0)
+ _bus_dmamap_waitok(dmat, map, mem, callback, callback_arg);
+
+ nsegs = -1;
+ error = 0;
+ switch (mem->md_type) {
+ case MEMDESC_VADDR:
+ error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr,
+ mem->md_opaque, kernel_pmap, flags, NULL, &nsegs);
+ break;
+ case MEMDESC_PADDR:
+ error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr,
+ mem->md_opaque, flags, NULL, &nsegs);
+ break;
+ case MEMDESC_VLIST:
+ error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list,
+ mem->md_opaque, kernel_pmap, &nsegs, flags);
+ break;
+ case MEMDESC_PLIST:
+ error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list,
+ mem->md_opaque, &nsegs, flags);
+ break;
+ case MEMDESC_BIO:
+ error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio,
+ &nsegs, flags);
+ break;
+ case MEMDESC_UIO:
+ error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio,
+ &nsegs, flags);
+ break;
+ case MEMDESC_MBUF:
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf,
+ NULL, &nsegs, flags);
+ break;
+ case MEMDESC_CCB:
+ error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs,
+ flags);
+ break;
+ }
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, 0);
+
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c
new file mode 100644
index 0000000..a80a233
--- /dev/null
+++ b/sys/kern/subr_busdma_bufalloc.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2012 Ian Lepore
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Buffer allocation support routines for bus_dmamem_alloc implementations.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/busdma_bufalloc.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+/*
+ * We manage buffer zones up to a page in size. Buffers larger than a page can
+ * be managed by one of the kernel's page-oriented memory allocation routines as
+ * efficiently as what we can do here. Also, a page is the largest size for
+ * which we can guarantee contiguity when using uma, and contiguity is one of
+ * requirements we have to fulfill.
+ */
+#define MIN_ZONE_BUFSIZE 32
+#define MAX_ZONE_BUFSIZE PAGE_SIZE
+
+/*
+ * The static array of 12 bufzones is big enough to handle all the zones for the
+ * smallest supported allocation size of 32 through the largest supported page
+ * size of 64K. If you up the biggest page size number, up the array size too.
+ * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1,
+ * but I don't know of an easy way to express that as a compile-time constant.
+ */
+#if PAGE_SIZE > 65536
+#error Unsupported page size
+#endif
+
+struct busdma_bufalloc {
+ bus_size_t min_size;
+ size_t num_zones;
+ struct busdma_bufzone buf_zones[12];
+};
+
+busdma_bufalloc_t
+busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment,
+ uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags)
+{
+ struct busdma_bufalloc *ba;
+ struct busdma_bufzone *bz;
+ int i;
+ bus_size_t cursize;
+
+ ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF,
+ M_ZERO | M_WAITOK);
+
+ ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment);
+
+ /*
+ * Each uma zone is created with an alignment of size-1, meaning that
+ * the alignment is equal to the size (i.e., 64-byte buffers are aligned
+ * to 64-byte boundaries, etc.). This allows for a fast, efficient test
+ * when deciding whether a pool buffer meets the constraints of a given
+ * tag used for allocation: the buffer is usable if tag->alignment <=
+ * bufzone->size.
+ */
+ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size;
+ i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE;
+ ++i, ++bz, cursize <<= 1) {
+ snprintf(bz->name, sizeof(bz->name), "dma %.10s %lu",
+ name, cursize);
+ bz->size = cursize;
+ bz->umazone = uma_zcreate(bz->name, bz->size,
+ NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags);
+ if (bz->umazone == NULL) {
+ busdma_bufalloc_destroy(ba);
+ return (NULL);
+ }
+ if (alloc_func != NULL)
+ uma_zone_set_allocf(bz->umazone, alloc_func);
+ if (free_func != NULL)
+ uma_zone_set_freef(bz->umazone, free_func);
+ ++ba->num_zones;
+ }
+
+ return (ba);
+}
+
+void
+busdma_bufalloc_destroy(busdma_bufalloc_t ba)
+{
+ struct busdma_bufzone *bz;
+ int i;
+
+ if (ba == NULL)
+ return;
+
+ for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+ uma_zdestroy(bz->umazone);
+ }
+
+ free(ba, M_DEVBUF);
+}
+
+struct busdma_bufzone *
+busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
+{
+ struct busdma_bufzone *bz;
+ int i;
+
+ if (size > MAX_ZONE_BUFSIZE)
+ return (NULL);
+
+ for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+ if (bz->size >= size)
+ return (bz);
+ }
+
+ panic("Didn't find a buffer zone of the right size");
+}
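
To show where these helpers sit, here is a sketch of how a machine-dependent bus_dmamem_alloc() might use them; coherent_allocator, the foo_ names, and the SYSINIT placement are assumptions for illustration. Buffers up to MAX_ZONE_BUFSIZE come out of the matching uma zone; anything bigger is left to a page-level allocator.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/busdma_bufalloc.h>
#include <vm/vm.h>
#include <vm/uma.h>
#include <machine/bus.h>

static busdma_bufalloc_t coherent_allocator;	/* hypothetical, set up once */

static void
foo_busdma_init(void *dummy __unused)
{

	coherent_allocator = busdma_bufalloc_create("coherent",
	    sizeof(void *),			/* minimum alignment */
	    busdma_bufalloc_alloc_uncacheable,
	    busdma_bufalloc_free_uncacheable, 0);
}
SYSINIT(foo_busdma, SI_SUB_KMEM, SI_ORDER_ANY, foo_busdma_init, NULL);

static void *
foo_dmamem_alloc(bus_size_t size, int mflags)
{
	struct busdma_bufzone *bufzone;

	bufzone = busdma_bufalloc_findzone(coherent_allocator, size);
	if (bufzone != NULL)
		return (uma_zalloc(bufzone->umazone, mflags));
	/* Larger than MAX_ZONE_BUFSIZE: fall back to a page-level allocator. */
	return (NULL);
}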
+
+void *
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
+ int wait)
+{
+#ifdef VM_MEMATTR_UNCACHEABLE
+
+ /* Inform UMA that this allocator uses kernel_arena/object. */
+ *pflag = UMA_SLAB_KERNEL;
+
+ return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0,
+ BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
+
+#else
+
+ panic("VM_MEMATTR_UNCACHEABLE unavailable");
+
+#endif /* VM_MEMATTR_UNCACHEABLE */
+}
+
+void
+busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag)
+{
+
+ kmem_free(kernel_arena, (vm_offset_t)item, size);
+}
+
diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
new file mode 100644
index 0000000..61ace5a
--- /dev/null
+++ b/sys/kern/subr_capability.c
@@ -0,0 +1,298 @@
+/*-
+ * Copyright (c) 2013 FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/capability.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/capability.h>
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#endif
+
+#ifdef _KERNEL
+#define assert(exp) KASSERT((exp), ("%s:%u", __func__, __LINE__))
+#endif
+
+#define CAPARSIZE_MIN (CAP_RIGHTS_VERSION_00 + 2)
+#define CAPARSIZE_MAX (CAP_RIGHTS_VERSION + 2)
+
+static __inline int
+right_to_index(uint64_t right)
+{
+ static const int bit2idx[] = {
+ -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
+ 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+ int idx;
+
+ idx = CAPIDXBIT(right);
+ assert(idx >= 0 && idx < sizeof(bit2idx) / sizeof(bit2idx[0]));
+ return (bit2idx[idx]);
+}
+
+static void
+cap_rights_vset(cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ rights->cr_rights[i] |= right;
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ }
+}
+
+static void
+cap_rights_vclear(cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ rights->cr_rights[i] &= ~(right & 0x01FFFFFFFFFFFFFFULL);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ }
+}
+
+static bool
+cap_rights_is_vset(const cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ if ((rights->cr_rights[i] & right) != right)
+ return (false);
+ }
+
+ return (true);
+}
+
+cap_rights_t *
+__cap_rights_init(int version, cap_rights_t *rights, ...)
+{
+ unsigned int n;
+ va_list ap;
+
+ assert(version == CAP_RIGHTS_VERSION_00);
+
+ n = version + 2;
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+ memset(rights->cr_rights, 0, sizeof(rights->cr_rights[0]) * n);
+ CAP_NONE(rights);
+ va_start(ap, rights);
+ cap_rights_vset(rights, ap);
+ va_end(ap);
+
+ return (rights);
+}
+
+void
+__cap_rights_set(cap_rights_t *rights, ...)
+{
+ va_list ap;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ cap_rights_vset(rights, ap);
+ va_end(ap);
+}
+
+void
+__cap_rights_clear(cap_rights_t *rights, ...)
+{
+ va_list ap;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ cap_rights_vclear(rights, ap);
+ va_end(ap);
+}
+
+bool
+__cap_rights_is_set(const cap_rights_t *rights, ...)
+{
+ va_list ap;
+ bool ret;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ ret = cap_rights_is_vset(rights, ap);
+ va_end(ap);
+
+ return (ret);
+}
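
For orientation, the public cap_rights_init()/cap_rights_set()/cap_rights_clear()/cap_rights_is_set() macros wrap the functions above and append the terminating 0 themselves; a minimal consumer might look like the sketch below, with cap_rights_limit(2) shown as the usual companion step. The limit_to_read() wrapper is hypothetical.

#include <sys/capability.h>

#include <stdbool.h>

static int
limit_to_read(int fd)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_READ, CAP_FSTAT, CAP_SEEK);
	cap_rights_clear(&rights, CAP_SEEK);	/* keep only CAP_READ, CAP_FSTAT */
	if (cap_rights_is_set(&rights, CAP_WRITE))
		return (-1);			/* never granted above */
	return (cap_rights_limit(fd, &rights));
}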
+
+bool
+cap_rights_is_valid(const cap_rights_t *rights)
+{
+ cap_rights_t allrights;
+ int i, j;
+
+ if (CAPVER(rights) != CAP_RIGHTS_VERSION_00)
+ return (false);
+ if (CAPARSIZE(rights) < CAPARSIZE_MIN ||
+ CAPARSIZE(rights) > CAPARSIZE_MAX) {
+ return (false);
+ }
+ CAP_ALL(&allrights);
+ if (!cap_rights_contains(&allrights, rights))
+ return (false);
+ for (i = 0; i < CAPARSIZE(rights); i++) {
+ j = right_to_index(rights->cr_rights[i]);
+ if (i != j)
+ return (false);
+ if (i > 0) {
+ if (CAPRVER(rights->cr_rights[i]) != 0)
+ return (false);
+ }
+ }
+
+ return (true);
+}
+
+void
+cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(src) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(dst) == CAPVER(src));
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+
+ n = CAPARSIZE(dst);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++)
+ dst->cr_rights[i] |= src->cr_rights[i];
+
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+}
+
+void
+cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(src) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(dst) == CAPVER(src));
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+
+ n = CAPARSIZE(dst);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++) {
+ dst->cr_rights[i] &=
+ ~(src->cr_rights[i] & 0x01FFFFFFFFFFFFFFULL);
+ }
+
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+}
+
+bool
+cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(big) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(little) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(big) == CAPVER(little));
+
+ n = CAPARSIZE(big);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++) {
+ if ((big->cr_rights[i] & little->cr_rights[i]) !=
+ little->cr_rights[i]) {
+ return (false);
+ }
+ }
+
+ return (true);
+}
diff --git a/sys/kern/subr_clock.c b/sys/kern/subr_clock.c
new file mode 100644
index 0000000..dbd74f7
--- /dev/null
+++ b/sys/kern/subr_clock.c
@@ -0,0 +1,225 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+int tz_minuteswest;
+int tz_dsttime;
+
+/*
+ * The adjkerntz and wall_cmos_clock sysctls are in the "machdep" sysctl
+ * namespace because they were misplaced there originally.
+ */
+static int adjkerntz;
+static int
+sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error && req->newptr)
+ resettodr();
+ return (error);
+}
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+ &adjkerntz, 0, sysctl_machdep_adjkerntz, "I",
+ "Local offset from UTC in seconds");
+
+static int ct_debug;
+SYSCTL_INT(_debug, OID_AUTO, clocktime, CTLFLAG_RW,
+ &ct_debug, 0, "Enable printing of clocktime debugging");
+
+static int wall_cmos_clock;
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, CTLFLAG_RW,
+ &wall_cmos_clock, 0, "Enables application of machdep.adjkerntz");
+
+/*--------------------------------------------------------------------*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+
+
+#define FEBRUARY 2
+#define days_in_year(y) (leapyear(y) ? 366 : 365)
+#define days_in_month(y, m) \
+ (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define day_of_week(days) (((days) + 4) % 7)
+
+static const int month_days[12] = {
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+
+/*
+ * This inline avoids some unnecessary modulo operations
+ * as compared with the usual macro:
+ * ( ((year % 4) == 0 &&
+ * (year % 100) != 0) ||
+ * ((year % 400) == 0) )
+ * It is otherwise equivalent.
+ */
+static int
+leapyear(int year)
+{
+ int rv = 0;
+
+ if ((year & 3) == 0) {
+ rv = 1;
+ if ((year % 100) == 0) {
+ rv = 0;
+ if ((year % 400) == 0)
+ rv = 1;
+ }
+ }
+ return (rv);
+}
+
+static void
+print_ct(struct clocktime *ct)
+{
+ printf("[%04d-%02d-%02d %02d:%02d:%02d]",
+ ct->year, ct->mon, ct->day,
+ ct->hour, ct->min, ct->sec);
+}
+
+int
+clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
+{
+ time_t secs;
+ int i, year, days;
+
+ year = ct->year;
+
+ if (ct_debug) {
+ printf("ct_to_ts(");
+ print_ct(ct);
+ printf(")");
+ }
+
+ /* Sanity checks. */
+ if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
+ ct->day > days_in_month(year, ct->mon) ||
+ ct->hour > 23 || ct->min > 59 || ct->sec > 59 ||
+ ct->year > 2037) { /* time_t overflow */
+ if (ct_debug)
+ printf(" = EINVAL\n");
+ return (EINVAL);
+ }
+
+ /*
+ * Compute days since start of time
+ * First from years, then from months.
+ */
+ days = 0;
+ for (i = POSIX_BASE_YEAR; i < year; i++)
+ days += days_in_year(i);
+
+ /* Months */
+ for (i = 1; i < ct->mon; i++)
+ days += days_in_month(year, i);
+ days += (ct->day - 1);
+
+ /* Add hours, minutes, seconds. */
+ secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec;
+
+ ts->tv_sec = secs;
+ ts->tv_nsec = ct->nsec;
+ if (ct_debug)
+ printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec);
+ return (0);
+}
+
+void
+clock_ts_to_ct(struct timespec *ts, struct clocktime *ct)
+{
+ int i, year, days;
+ time_t rsec; /* remainder seconds */
+ time_t secs;
+
+ secs = ts->tv_sec;
+ days = secs / SECDAY;
+ rsec = secs % SECDAY;
+
+ ct->dow = day_of_week(days);
+
+	/* Subtract out whole years, counting them in year. */
+ for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++)
+ days -= days_in_year(year);
+ ct->year = year;
+
+ /* Subtract out whole months, counting them in i. */
+ for (i = 1; days >= days_in_month(year, i); i++)
+ days -= days_in_month(year, i);
+ ct->mon = i;
+
+ /* Days are what is left over (+1) from all that. */
+ ct->day = days + 1;
+
+ /* Hours, minutes, seconds are easy */
+ ct->hour = rsec / 3600;
+ rsec = rsec % 3600;
+ ct->min = rsec / 60;
+ rsec = rsec % 60;
+ ct->sec = rsec;
+ ct->nsec = ts->tv_nsec;
+ if (ct_debug) {
+ printf("ts_to_ct(%ld.%09ld) = ",
+ (long)ts->tv_sec, (long)ts->tv_nsec);
+ print_ct(ct);
+ printf("\n");
+ }
+}
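
These converters are aimed at RTC drivers; a hypothetical clock_gettime method (foo_read_reg() and the FOO_RTC_* register names are stand-ins) only has to fill in struct clocktime and let clock_ct_to_ts() do the validation and arithmetic.

#include <sys/param.h>
#include <sys/time.h>
#include <sys/bus.h>
#include <sys/clock.h>

static int
foo_rtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* foo_read_reg() stands in for whatever bus access the part needs. */
	ct.year = 2000 + foo_read_reg(dev, FOO_RTC_YEAR);
	ct.mon = foo_read_reg(dev, FOO_RTC_MONTH);	/* 1..12 */
	ct.day = foo_read_reg(dev, FOO_RTC_DAY);	/* 1..31 */
	ct.hour = foo_read_reg(dev, FOO_RTC_HOUR);	/* 0..23 */
	ct.min = foo_read_reg(dev, FOO_RTC_MIN);
	ct.sec = foo_read_reg(dev, FOO_RTC_SEC);
	ct.nsec = 0;

	/* Rejects impossible dates with EINVAL, as implemented above. */
	return (clock_ct_to_ts(&ct, ts));
}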
+
+int
+utc_offset(void)
+{
+
+ return (tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0));
+}
diff --git a/sys/kern/subr_counter.c b/sys/kern/subr_counter.c
new file mode 100644
index 0000000..b3ddc7a
--- /dev/null
+++ b/sys/kern/subr_counter.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <vm/uma.h>
+
+#define IN_SUBR_COUNTER_C
+#include <sys/counter.h>
+
+static uma_zone_t uint64_pcpu_zone;
+
+void
+counter_u64_zero(counter_u64_t c)
+{
+
+ counter_u64_zero_inline(c);
+}
+
+uint64_t
+counter_u64_fetch(counter_u64_t c)
+{
+
+ return (counter_u64_fetch_inline(c));
+}
+
+counter_u64_t
+counter_u64_alloc(int flags)
+{
+ counter_u64_t r;
+
+ r = uma_zalloc(uint64_pcpu_zone, flags);
+ if (r != NULL)
+ counter_u64_zero(r);
+
+ return (r);
+}
+
+void
+counter_u64_free(counter_u64_t c)
+{
+
+ uma_zfree(uint64_pcpu_zone, c);
+}
+
+int
+sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t out;
+ int error;
+
+ out = counter_u64_fetch(*(counter_u64_t *)arg1);
+
+ error = SYSCTL_OUT(req, &out, sizeof(uint64_t));
+
+ if (error || !req->newptr)
+ return (error);
+
+ /*
+ * Any write attempt to a counter zeroes it.
+ */
+ counter_u64_zero(*(counter_u64_t *)arg1);
+
+ return (0);
+}
+
+static void
+counter_startup(void)
+{
+
+ uint64_pcpu_zone = uma_zcreate("uint64 pcpu", sizeof(uint64_t),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
+}
+SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_ANY, counter_startup, NULL);
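
A sketch of the intended consumer pattern, with foo_pkts, the SYSINIT placement, and the sysctl location all invented for illustration: allocate once after the per-CPU zone above exists, bump from hot paths with counter_u64_add() (lock-free, per-CPU), and export through sysctl_handle_counter_u64() so that any write from userland zeroes the counter.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/counter.h>

static counter_u64_t foo_pkts;

static void
foo_counter_init(void *dummy __unused)
{

	foo_pkts = counter_u64_alloc(M_WAITOK);
}
/* SI_SUB_DRIVERS is simply "late enough" to run after counter_startup() above. */
SYSINIT(foo_counter, SI_SUB_DRIVERS, SI_ORDER_ANY, foo_counter_init, NULL);

/* Hot path: per-CPU increment, no locks, no shared cache line. */
static void
foo_count_packet(void)
{

	counter_u64_add(foo_pkts, 1);
}

SYSCTL_PROC(_debug, OID_AUTO, foo_pkts,
    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_pkts, 0,
    sysctl_handle_counter_u64, "QU", "Packets counted by foo");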
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
new file mode 100644
index 0000000..c44ef27
--- /dev/null
+++ b/sys/kern/subr_devstat.c
@@ -0,0 +1,604 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/devicestat.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/conf.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/atomic.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_io_start_probe_func_t dtrace_io_start_probe;
+dtrace_io_done_probe_func_t dtrace_io_done_probe;
+dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
+dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;
+
+uint32_t dtio_start_id;
+uint32_t dtio_done_id;
+uint32_t dtio_wait_start_id;
+uint32_t dtio_wait_done_id;
+
+#define DTRACE_DEVSTAT_START() \
+ if (dtrace_io_start_probe != NULL) \
+ (*dtrace_io_start_probe)(dtio_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_START() \
+ if (dtrace_io_start_probe != NULL) \
+ (*dtrace_io_start_probe)(dtio_start_id, bp, ds);
+
+#define DTRACE_DEVSTAT_DONE() \
+ if (dtrace_io_done_probe != NULL) \
+ (*dtrace_io_done_probe)(dtio_done_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_DONE() \
+ if (dtrace_io_done_probe != NULL) \
+ (*dtrace_io_done_probe)(dtio_done_id, bp, ds);
+
+#define DTRACE_DEVSTAT_WAIT_START() \
+ if (dtrace_io_wait_start_probe != NULL) \
+ (*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_WAIT_DONE() \
+ if (dtrace_io_wait_done_probe != NULL) \
+ (*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);
+
+#else /* ! KDTRACE_HOOKS */
+
+#define DTRACE_DEVSTAT_START()
+
+#define DTRACE_DEVSTAT_BIO_START()
+
+#define DTRACE_DEVSTAT_DONE()
+
+#define DTRACE_DEVSTAT_BIO_DONE()
+
+#define DTRACE_DEVSTAT_WAIT_START()
+
+#define DTRACE_DEVSTAT_WAIT_DONE()
+#endif /* KDTRACE_HOOKS */
+
+static int devstat_num_devs;
+static long devstat_generation = 1;
+static int devstat_version = DEVSTAT_VERSION;
+static int devstat_current_devnumber;
+static struct mtx devstat_mutex;
+MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);
+
+static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
+static struct devstat *devstat_alloc(void);
+static void devstat_free(struct devstat *);
+static void devstat_add_entry(struct devstat *ds, const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority);
+
+/*
+ * Allocate a devstat and initialize it
+ */
+struct devstat *
+devstat_new_entry(const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority)
+{
+ struct devstat *ds;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ ds = devstat_alloc();
+ mtx_lock(&devstat_mutex);
+ if (unit_number == -1) {
+ ds->id = dev_name;
+ binuptime(&ds->creation_time);
+ devstat_generation++;
+ } else {
+ devstat_add_entry(ds, dev_name, unit_number, block_size,
+ flags, device_type, priority);
+ }
+ mtx_unlock(&devstat_mutex);
+ return (ds);
+}
+
+/*
+ * Take a malloced and zeroed devstat structure given to us, fill it in
+ * and add it to the queue of devices.
+ */
+static void
+devstat_add_entry(struct devstat *ds, const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority)
+{
+ struct devstatlist *devstat_head;
+ struct devstat *ds_tmp;
+
+ mtx_assert(&devstat_mutex, MA_OWNED);
+ devstat_num_devs++;
+
+ devstat_head = &device_statq;
+
+ /*
+ * Priority sort. Each driver passes in its priority when it adds
+ * its devstat entry. Drivers are sorted first by priority, and
+ * then by probe order.
+ *
+ * For the first device, we just insert it, since the priority
+ * doesn't really matter yet. Subsequent devices are inserted into
+ * the list using the order outlined above.
+ */
+ if (devstat_num_devs == 1)
+ STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
+ else {
+ STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
+ struct devstat *ds_next;
+
+ ds_next = STAILQ_NEXT(ds_tmp, dev_links);
+
+ /*
+ * If we find a break between higher and lower
+ * priority items, and if this item fits in the
+ * break, insert it. This also applies if the
+ * "lower priority item" is the end of the list.
+ */
+ if ((priority <= ds_tmp->priority)
+ && ((ds_next == NULL)
+ || (priority > ds_next->priority))) {
+ STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
+ dev_links);
+ break;
+ } else if (priority > ds_tmp->priority) {
+ /*
+ * If this is the case, we should be able
+ * to insert ourselves at the head of the
+ * list. If we can't, something is wrong.
+ */
+ if (ds_tmp == STAILQ_FIRST(devstat_head)) {
+ STAILQ_INSERT_HEAD(devstat_head,
+ ds, dev_links);
+ break;
+ } else {
+ STAILQ_INSERT_TAIL(devstat_head,
+ ds, dev_links);
+ printf("devstat_add_entry: HELP! "
+ "sorting problem detected "
+ "for name %p unit %d\n",
+ dev_name, unit_number);
+ break;
+ }
+ }
+ }
+ }
+
+ ds->device_number = devstat_current_devnumber++;
+ ds->unit_number = unit_number;
+ strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
+ ds->block_size = block_size;
+ ds->flags = flags;
+ ds->device_type = device_type;
+ ds->priority = priority;
+ binuptime(&ds->creation_time);
+ devstat_generation++;
+}
+
+/*
+ * Remove a devstat structure from the list of devices.
+ */
+void
+devstat_remove_entry(struct devstat *ds)
+{
+ struct devstatlist *devstat_head;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+ if (ds == NULL)
+ return;
+
+ mtx_lock(&devstat_mutex);
+
+ devstat_head = &device_statq;
+
+ /* Remove this entry from the devstat queue */
+ atomic_add_acq_int(&ds->sequence1, 1);
+ if (ds->id == NULL) {
+ devstat_num_devs--;
+ STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
+ }
+ devstat_free(ds);
+ devstat_generation++;
+ mtx_unlock(&devstat_mutex);
+}
+
+/*
+ * Record a transaction start.
+ *
+ * See comments for devstat_end_transaction(). Ordering is very important
+ * here.
+ */
+void
+devstat_start_transaction(struct devstat *ds, struct bintime *now)
+{
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ atomic_add_acq_int(&ds->sequence1, 1);
+ /*
+ * We only want to set the start time when we are going from idle
+ * to busy. The start time is really the start of the latest busy
+ * period.
+ */
+ if (ds->start_count == ds->end_count) {
+ if (now != NULL)
+ ds->busy_from = *now;
+ else
+ binuptime(&ds->busy_from);
+ }
+ ds->start_count++;
+ atomic_add_rel_int(&ds->sequence0, 1);
+ DTRACE_DEVSTAT_START();
+}
+
+void
+devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
+{
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ binuptime(&bp->bio_t0);
+ devstat_start_transaction(ds, &bp->bio_t0);
+ DTRACE_DEVSTAT_BIO_START();
+}
+
+/*
+ * Record the ending of a transaction, and increment the various counters.
+ *
+ * Ordering in this function, and in devstat_start_transaction() is VERY
+ * important. The idea here is to run without locks, so we are very
+ * careful to only modify some fields on the way "down" (i.e. at
+ * transaction start) and some fields on the way "up" (i.e. at transaction
+ * completion). One exception is busy_from, which we only modify in
+ * devstat_start_transaction() when there are no outstanding transactions,
+ * and thus it can't be modified in devstat_end_transaction()
+ * simultaneously.
+ *
+ * The sequence0 and sequence1 fields are provided to enable an application
+ * spying on the structures with mmap(2) to tell when a structure is in a
+ * consistent state or not.
+ *
+ * For this to work 100% reliably, it is important that the two fields
+ * are at opposite ends of the structure and that they are incremented
+ * in the opposite order of how a memcpy(3) in userland would copy them.
+ * We assume that the copying happens front to back, but there is actually
+ * no way short of writing your own memcpy(3) replacement to guarantee
+ * this will be the case.
+ *
+ * In addition to this, being a kind of lock, they must be updated with
+ * atomic instructions using appropriate memory barriers.
+ */
+void
+devstat_end_transaction(struct devstat *ds, uint32_t bytes,
+ devstat_tag_type tag_type, devstat_trans_flags flags,
+ struct bintime *now, struct bintime *then)
+{
+ struct bintime dt, lnow;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ if (now == NULL) {
+ now = &lnow;
+ binuptime(now);
+ }
+
+ atomic_add_acq_int(&ds->sequence1, 1);
+ /* Update byte and operations counts */
+ ds->bytes[flags] += bytes;
+ ds->operations[flags]++;
+
+ /*
+ * Keep a count of the various tag types sent.
+ */
+ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
+ tag_type != DEVSTAT_TAG_NONE)
+ ds->tag_types[tag_type]++;
+
+ if (then != NULL) {
+ /* Update duration of operations */
+ dt = *now;
+ bintime_sub(&dt, then);
+ bintime_add(&ds->duration[flags], &dt);
+ }
+
+ /* Accumulate busy time */
+ dt = *now;
+ bintime_sub(&dt, &ds->busy_from);
+ bintime_add(&ds->busy_time, &dt);
+ ds->busy_from = *now;
+
+ ds->end_count++;
+ atomic_add_rel_int(&ds->sequence0, 1);
+ DTRACE_DEVSTAT_DONE();
+}
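
Put differently, a userland consumer that maps or copies these structures is expected to take snapshots like the sketch below (shared would point into the mapping served by the devstat device defined later in this file) and retry whenever the two sequence numbers disagree.

#include <sys/devicestat.h>

#include <string.h>

/* Copy one devstat until a self-consistent snapshot is observed. */
static void
devstat_snapshot(const struct devstat *shared, struct devstat *snap)
{

	do {
		memcpy(snap, shared, sizeof(*snap));	/* front-to-back copy */
	} while (snap->sequence0 != snap->sequence1);
}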
+
+void
+devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
+{
+ devstat_trans_flags flg;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ if (bp->bio_cmd == BIO_DELETE)
+ flg = DEVSTAT_FREE;
+ else if (bp->bio_cmd == BIO_READ)
+ flg = DEVSTAT_READ;
+ else if (bp->bio_cmd == BIO_WRITE)
+ flg = DEVSTAT_WRITE;
+ else
+ flg = DEVSTAT_NO_DATA;
+
+ devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
+ DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
+ DTRACE_DEVSTAT_BIO_DONE();
+}
+
+/*
+ * This is the sysctl handler for the devstat package. The data pushed out
+ * on the kern.devstat.all sysctl variable consists of the current devstat
+ * generation number, and then an array of devstat structures, one for each
+ * device in the system.
+ *
+ * This is more cryptic than obvious, but basically we neither can nor
+ * want to hold the devstat_mutex for any amount of time, so we grab it
+ * only when we need to and keep an eye on devstat_generation all the time.
+ */
+static int
+sysctl_devstat(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long mygen;
+ struct devstat *nds;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /*
+ * XXX devstat_generation should really be "volatile" but that
+ * XXX freaks out the sysctl macro below. The places where we
+ * XXX change it and inspect it are bracketed in the mutex which
+ * XXX guarantees us proper write barriers. I don't believe the
+ * XXX compiler is allowed to optimize mygen away across calls
+ * XXX to other functions, so the following is believed to be safe.
+ */
+ mygen = devstat_generation;
+
+ error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
+
+ if (devstat_num_devs == 0)
+		return (0);
+
+ if (error != 0)
+ return (error);
+
+ mtx_lock(&devstat_mutex);
+ nds = STAILQ_FIRST(&device_statq);
+ if (mygen != devstat_generation)
+ error = EBUSY;
+ mtx_unlock(&devstat_mutex);
+
+ if (error != 0)
+ return (error);
+
+ for (;nds != NULL;) {
+ error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
+ if (error != 0)
+ return (error);
+ mtx_lock(&devstat_mutex);
+ if (mygen != devstat_generation)
+ error = EBUSY;
+ else
+ nds = STAILQ_NEXT(nds, dev_links);
+ mtx_unlock(&devstat_mutex);
+ if (error != 0)
+ return (error);
+ }
+	return (error);
+}
+
+/*
+ * Sysctl entries for devstat. The first one is a node that all the rest
+ * hang off of.
+ */
+static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
+ "Device Statistics");
+
+SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
+ NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
+/*
+ * Export the number of devices in the system so that userland utilities
+ * can determine how much memory to allocate to hold all the devices.
+ */
+SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
+ &devstat_num_devs, 0, "Number of devices in the devstat list");
+SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
+ &devstat_generation, 0, "Devstat list generation");
+SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
+ &devstat_version, 0, "Devstat list version number");
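
The matching userland read, sketched here with error handling trimmed down (the fetch_devstats() wrapper is hypothetical), mirrors what the handler above emits: size the buffer from kern.devstat.numdevs, fetch kern.devstat.all, treat EBUSY as "generation changed, start over", and find the devstat array right behind the leading generation long. The caller frees the underlying buffer at (char *)retval - sizeof(long).

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/devicestat.h>

#include <stdlib.h>
#include <string.h>

static struct devstat *
fetch_devstats(long *generation, int *numdevs)
{
	char *buf;
	size_t len;

	len = sizeof(*numdevs);
	if (sysctlbyname("kern.devstat.numdevs", numdevs, &len, NULL, 0) != 0)
		return (NULL);

	len = sizeof(long) + *numdevs * sizeof(struct devstat);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	if (sysctlbyname("kern.devstat.all", buf, &len, NULL, 0) != 0) {
		free(buf);		/* EBUSY: list changed mid-copy, retry */
		return (NULL);
	}
	memcpy(generation, buf, sizeof(*generation));
	return ((struct devstat *)(buf + sizeof(long)));
}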
+
+/*
+ * Allocator for struct devstat structures. We sub-allocate these from pages
+ * which we get from malloc. These pages are exported for mmap(2)'ing through
+ * a miniature device driver
+ */
+
+#define statsperpage (PAGE_SIZE / sizeof(struct devstat))
+
+static d_mmap_t devstat_mmap;
+
+static struct cdevsw devstat_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_NEEDGIANT,
+ .d_mmap = devstat_mmap,
+ .d_name = "devstat",
+};
+
+struct statspage {
+ TAILQ_ENTRY(statspage) list;
+ struct devstat *stat;
+ u_int nfree;
+};
+
+static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
+static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
+
+static int
+devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ struct statspage *spp;
+
+ if (nprot != VM_PROT_READ)
+ return (-1);
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (offset == 0) {
+ *paddr = vtophys(spp->stat);
+ return (0);
+ }
+ offset -= PAGE_SIZE;
+ }
+ return (-1);
+}
+
+static struct devstat *
+devstat_alloc(void)
+{
+ struct devstat *dsp;
+ struct statspage *spp, *spp2;
+ u_int u;
+ static int once;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+ if (!once) {
+ make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
+ &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
+ DEVSTAT_DEVICE_NAME);
+ once = 1;
+ }
+ spp2 = NULL;
+ mtx_lock(&devstat_mutex);
+ for (;;) {
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (spp->nfree > 0)
+ break;
+ }
+ if (spp != NULL)
+ break;
+ mtx_unlock(&devstat_mutex);
+ spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
+ spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
+ spp2->nfree = statsperpage;
+
+ /*
+ * If free statspages were added while the lock was released
+ * just reuse them.
+ */
+ mtx_lock(&devstat_mutex);
+ TAILQ_FOREACH(spp, &pagelist, list)
+ if (spp->nfree > 0)
+ break;
+ if (spp == NULL) {
+ spp = spp2;
+
+ /*
+ * It would make more sense to add the new page at the
+			 * head, but the order on the list determines the
+			 * sequence of the mapping, so we can't do that.
+ */
+ TAILQ_INSERT_TAIL(&pagelist, spp, list);
+ } else
+ break;
+ }
+ dsp = spp->stat;
+ for (u = 0; u < statsperpage; u++) {
+ if (dsp->allocated == 0)
+ break;
+ dsp++;
+ }
+ spp->nfree--;
+ dsp->allocated = 1;
+ mtx_unlock(&devstat_mutex);
+ if (spp2 != NULL && spp2 != spp) {
+ free(spp2->stat, M_DEVSTAT);
+ free(spp2, M_DEVSTAT);
+ }
+ return (dsp);
+}
+
+static void
+devstat_free(struct devstat *dsp)
+{
+ struct statspage *spp;
+
+ mtx_assert(&devstat_mutex, MA_OWNED);
+ bzero(dsp, sizeof *dsp);
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
+ spp->nfree++;
+ return;
+ }
+ }
+}
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
+ NULL, sizeof(struct devstat), "sizeof(struct devstat)");
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
new file mode 100644
index 0000000..2391540
--- /dev/null
+++ b/sys/kern/subr_disk.c
@@ -0,0 +1,267 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * The bioq_disksort() (and the specification of the bioq API)
+ * have been written by Luigi Rizzo and Fabio Checconi under the same
+ * license as above.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_geom.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <geom/geom_disk.h>
+
+/*-
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+ * "hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
+ * blkdone should be -1 if the position of the error is unknown.
+ * The message is printed with printf.
+ */
+void
+disk_err(struct bio *bp, const char *what, int blkdone, int nl)
+{
+ daddr_t sn;
+
+ if (bp->bio_dev != NULL)
+ printf("%s: %s ", devtoname(bp->bio_dev), what);
+ else if (bp->bio_disk != NULL)
+ printf("%s%d: %s ",
+ bp->bio_disk->d_name, bp->bio_disk->d_unit, what);
+ else
+ printf("disk??: %s ", what);
+ switch(bp->bio_cmd) {
+ case BIO_READ: printf("cmd=read "); break;
+ case BIO_WRITE: printf("cmd=write "); break;
+ case BIO_DELETE: printf("cmd=delete "); break;
+ case BIO_GETATTR: printf("cmd=getattr "); break;
+ case BIO_FLUSH: printf("cmd=flush "); break;
+ default: printf("cmd=%x ", bp->bio_cmd); break;
+ }
+ sn = bp->bio_pblkno;
+ if (bp->bio_bcount <= DEV_BSIZE) {
+ printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
+ return;
+ }
+ if (blkdone >= 0) {
+ sn += blkdone;
+ printf("fsbn %jd of ", (intmax_t)sn);
+ }
+ printf("%jd-%jd", (intmax_t)bp->bio_pblkno,
+ (intmax_t)(bp->bio_pblkno + (bp->bio_bcount - 1) / DEV_BSIZE));
+ if (nl)
+ printf("\n");
+}
+
+/*
+ * BIO queue implementation
+ *
+ * Please read carefully the description below before making any change
+ * to the code, or you might change the behaviour of the data structure
+ * in undesirable ways.
+ *
+ * A bioq stores disk I/O requests (bio), normally sorted according to
+ * the distance of the requested position (bio->bio_offset) from the
+ * current head position (bioq->last_offset) in the scan direction, i.e.
+ *
+ * (uoff_t)(bio_offset - last_offset)
+ *
+ * Note that the cast to unsigned (uoff_t) is fundamental to ensure
+ * that the distance is computed in the scan direction.
+ *
+ * The main methods for manipulating the bioq are:
+ *
+ * bioq_disksort() performs an ordered insertion;
+ *
+ *	bioq_first()	returns the head of the queue without removing it;
+ *
+ *	bioq_takefirst()	returns and removes the head of the queue,
+ * updating the 'current head position' as
+ * bioq->last_offset = bio->bio_offset + bio->bio_length;
+ *
+ * When updating the 'current head position', we assume that the result of
+ * bioq_takefirst() is dispatched to the device, so bioq->last_offset
+ * represents the head position once the request is complete.
+ *
+ * If the bioq is manipulated using only the above calls, it starts
+ * with a sorted sequence of requests with bio_offset >= last_offset,
+ * possibly followed by another sorted sequence of requests with
+ * 0 <= bio_offset < bioq->last_offset
+ *
+ * NOTE: historical behaviour was to ignore bio->bio_length in the
+ * update, but using it tracks the head position more accurately.
+ * Historical behaviour was also to update the head position when
+ * the request under service completed, rather than when the
+ * request is extracted from the queue. However, the current API
+ * has no method to update the head position at completion time;
+ * moreover, once a request has been submitted to the disk, we have
+ * no idea of the actual head position, so the final one is our best guess.
+ *
+ * --- Direct queue manipulation ---
+ *
+ * A bioq uses an underlying TAILQ to store requests, so we also
+ * export methods to manipulate the TAILQ, in particular:
+ *
+ *	bioq_insert_tail()	inserts an entry at the end.
+ *		It also creates a 'barrier' so all subsequent
+ *		insertions through bioq_disksort() will end up
+ *		after this entry;
+ *
+ *	bioq_insert_head()	inserts an entry at the head and updates
+ *		bioq->last_offset = bio->bio_offset so that
+ *		all subsequent insertions through bioq_disksort()
+ *		will end up after this entry;
+ *
+ *	bioq_remove()	removes a generic element from the queue; it acts
+ *		as bioq_takefirst() if invoked on the head of the queue.
+ *
+ * The semantics of these methods are the same as the operations
+ * on the underlying TAILQ, but with additional guarantees on
+ * subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
+ * can be useful for making sure that all previous ops are flushed
+ * to disk before continuing.
+ *
+ * Updating bioq->last_offset on a bioq_insert_head() guarantees
+ * that the bio inserted with the last bioq_insert_head() will stay
+ * at the head of the queue even after subsequent bioq_disksort().
+ *
+ * Note that when the direct queue manipulation functions are used,
+ * the queue may contain multiple inversion points (i.e. more than
+ * two sorted sequences of requests).
+ *
+ */
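+
+/*
+ * Worked example of the sorting key (the offsets below are hypothetical,
+ * chosen only to illustrate the wrap-around behaviour described above):
+ * with last_offset = 1000, a request at offset 1500 gets key
+ * (uoff_t)(1500 - 1000) = 500, while a request at offset 400 wraps to
+ * the very large key (uoff_t)(400 - 1000), so it sorts after every
+ * request at or beyond the current head position, giving the one-way
+ * elevator order that bioq_disksort() relies on.
+ */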
+
+void
+bioq_init(struct bio_queue_head *head)
+{
+
+ TAILQ_INIT(&head->queue);
+ head->last_offset = 0;
+ head->insert_point = NULL;
+}
+
+void
+bioq_remove(struct bio_queue_head *head, struct bio *bp)
+{
+
+ if (head->insert_point == NULL) {
+ if (bp == TAILQ_FIRST(&head->queue))
+ head->last_offset = bp->bio_offset + bp->bio_length;
+ } else if (bp == head->insert_point)
+ head->insert_point = NULL;
+
+ TAILQ_REMOVE(&head->queue, bp, bio_queue);
+}
+
+void
+bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error)
+{
+ struct bio *bp;
+
+ while ((bp = bioq_takefirst(head)) != NULL)
+ biofinish(bp, stp, error);
+}
+
+void
+bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
+{
+
+ if (head->insert_point == NULL)
+ head->last_offset = bp->bio_offset;
+ TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
+}
+
+void
+bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
+{
+
+ TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
+ head->insert_point = bp;
+ head->last_offset = bp->bio_offset;
+}
+
+struct bio *
+bioq_first(struct bio_queue_head *head)
+{
+
+ return (TAILQ_FIRST(&head->queue));
+}
+
+struct bio *
+bioq_takefirst(struct bio_queue_head *head)
+{
+ struct bio *bp;
+
+ bp = TAILQ_FIRST(&head->queue);
+ if (bp != NULL)
+ bioq_remove(head, bp);
+ return (bp);
+}
+
+/*
+ * Compute the sorting key. The cast to unsigned is
+ * fundamental for correctness, see the description
+ * near the beginning of the file.
+ */
+static inline uoff_t
+bioq_bio_key(struct bio_queue_head *head, struct bio *bp)
+{
+
+ return ((uoff_t)(bp->bio_offset - head->last_offset));
+}
+
+/*
+ * Seek sort for disks.
+ *
+ * Sort all requests in a single queue while keeping
+ * track of the current position of the disk with last_offset.
+ * See above for details.
+ */
+void
+bioq_disksort(struct bio_queue_head *head, struct bio *bp)
+{
+ struct bio *cur, *prev;
+ uoff_t key;
+
+ if ((bp->bio_flags & BIO_ORDERED) != 0) {
+ /*
+ * Ordered transactions can only be dispatched
+ * after any currently queued transactions. They
+ * also have barrier semantics - no transactions
+ * queued in the future can pass them.
+ */
+ bioq_insert_tail(head, bp);
+ return;
+ }
+
+ prev = NULL;
+ key = bioq_bio_key(head, bp);
+ cur = TAILQ_FIRST(&head->queue);
+
+ if (head->insert_point) {
+ prev = head->insert_point;
+ cur = TAILQ_NEXT(head->insert_point, bio_queue);
+ }
+
+ while (cur != NULL && key >= bioq_bio_key(head, cur)) {
+ prev = cur;
+ cur = TAILQ_NEXT(cur, bio_queue);
+ }
+
+ if (prev == NULL)
+ TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
+ else
+ TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue);
+}
diff --git a/sys/kern/subr_dummy_vdso_tc.c b/sys/kern/subr_dummy_vdso_tc.c
new file mode 100644
index 0000000..9c84501
--- /dev/null
+++ b/sys/kern/subr_dummy_vdso_tc.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib@FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/vdso.h>
+
+uint32_t
+cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+
+ return (0);
+}
+#endif
diff --git a/sys/kern/subr_eventhandler.c b/sys/kern/subr_eventhandler.c
new file mode 100644
index 0000000..5894099
--- /dev/null
+++ b/sys/kern/subr_eventhandler.c
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+
+static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records");
+
+/* List of 'slow' lists */
+static TAILQ_HEAD(, eventhandler_list) eventhandler_lists;
+static int eventhandler_lists_initted = 0;
+static struct mtx eventhandler_mutex;
+
+struct eventhandler_entry_generic
+{
+ struct eventhandler_entry ee;
+ void (* func)(void);
+};
+
+static struct eventhandler_list *_eventhandler_find_list(const char *name);
+
+/*
+ * Initialize the eventhandler mutex and list.
+ */
+static void
+eventhandler_init(void *dummy __unused)
+{
+ TAILQ_INIT(&eventhandler_lists);
+ mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF);
+ atomic_store_rel_int(&eventhandler_lists_initted, 1);
+}
+SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init,
+ NULL);
+
+/*
+ * Insertion is O(n) due to the priority scan, but optimises to O(1)
+ * if all priorities are identical.
+ */
+static eventhandler_tag
+eventhandler_register_internal(struct eventhandler_list *list,
+ const char *name, eventhandler_tag epn)
+{
+ struct eventhandler_list *new_list;
+ struct eventhandler_entry *ep;
+
+ KASSERT(eventhandler_lists_initted, ("eventhandler registered too early"));
+ KASSERT(epn != NULL, ("%s: cannot register NULL event", __func__));
+
+ /* lock the eventhandler lists */
+ mtx_lock(&eventhandler_mutex);
+
+ /* Do we need to find/create the (slow) list? */
+ if (list == NULL) {
+ /* look for a matching, existing list */
+ list = _eventhandler_find_list(name);
+
+ /* Do we need to create the list? */
+ if (list == NULL) {
+ mtx_unlock(&eventhandler_mutex);
+
+ new_list = malloc(sizeof(struct eventhandler_list) +
+ strlen(name) + 1, M_EVENTHANDLER, M_WAITOK);
+
+ /* If someone else created it already, then use that one. */
+ mtx_lock(&eventhandler_mutex);
+ list = _eventhandler_find_list(name);
+ if (list != NULL) {
+ free(new_list, M_EVENTHANDLER);
+ } else {
+ CTR2(KTR_EVH, "%s: creating list \"%s\"", __func__, name);
+ list = new_list;
+ list->el_flags = 0;
+ list->el_runcount = 0;
+ bzero(&list->el_lock, sizeof(list->el_lock));
+ list->el_name = (char *)list + sizeof(struct eventhandler_list);
+ strcpy(list->el_name, name);
+ TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link);
+ }
+ }
+ }
+ if (!(list->el_flags & EHL_INITTED)) {
+ TAILQ_INIT(&list->el_entries);
+ mtx_init(&list->el_lock, name, "eventhandler list", MTX_DEF);
+ atomic_store_rel_int(&list->el_flags, EHL_INITTED);
+ }
+ mtx_unlock(&eventhandler_mutex);
+
+ KASSERT(epn->ee_priority != EHE_DEAD_PRIORITY,
+ ("%s: handler for %s registered with dead priority", __func__, name));
+
+ /* sort it into the list */
+ CTR4(KTR_EVH, "%s: adding item %p (function %p) to \"%s\"", __func__, epn,
+ ((struct eventhandler_entry_generic *)epn)->func, name);
+ EHL_LOCK(list);
+ TAILQ_FOREACH(ep, &list->el_entries, ee_link) {
+ if (ep->ee_priority != EHE_DEAD_PRIORITY &&
+ epn->ee_priority < ep->ee_priority) {
+ TAILQ_INSERT_BEFORE(ep, epn, ee_link);
+ break;
+ }
+ }
+ if (ep == NULL)
+ TAILQ_INSERT_TAIL(&list->el_entries, epn, ee_link);
+ EHL_UNLOCK(list);
+ return(epn);
+}
+
+eventhandler_tag
+eventhandler_register(struct eventhandler_list *list, const char *name,
+ void *func, void *arg, int priority)
+{
+ struct eventhandler_entry_generic *eg;
+
+ /* allocate an entry for this handler, populate it */
+ eg = malloc(sizeof(struct eventhandler_entry_generic), M_EVENTHANDLER,
+ M_WAITOK | M_ZERO);
+ eg->func = func;
+ eg->ee.ee_arg = arg;
+ eg->ee.ee_priority = priority;
+
+ return (eventhandler_register_internal(list, name, &eg->ee));
+}
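+
+/*
+ * Illustrative usage sketch (not part of this file): consumers normally
+ * go through the macros in <sys/eventhandler.h> rather than calling
+ * eventhandler_register() directly.  The handler name foo_shutdown and
+ * its softc argument sc below are hypothetical:
+ *
+ *	eventhandler_tag tag;
+ *
+ *	tag = EVENTHANDLER_REGISTER(shutdown_final, foo_shutdown, sc,
+ *	    SHUTDOWN_PRI_DEFAULT);
+ *	...
+ *	EVENTHANDLER_DEREGISTER(shutdown_final, tag);
+ */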
+
+#ifdef VIMAGE
+struct eventhandler_entry_generic_vimage
+{
+ struct eventhandler_entry ee;
+ vimage_iterator_func_t func; /* Vimage iterator function. */
+ struct eventhandler_entry_vimage v_ee; /* Original func, arg. */
+};
+
+eventhandler_tag
+vimage_eventhandler_register(struct eventhandler_list *list, const char *name,
+ void *func, void *arg, int priority, vimage_iterator_func_t iterfunc)
+{
+ struct eventhandler_entry_generic_vimage *eg;
+
+ /* allocate an entry for this handler, populate it */
+ eg = malloc(sizeof(struct eventhandler_entry_generic_vimage),
+ M_EVENTHANDLER, M_WAITOK | M_ZERO);
+ eg->func = iterfunc;
+ eg->v_ee.func = func;
+ eg->v_ee.ee_arg = arg;
+ eg->ee.ee_arg = &eg->v_ee;
+ eg->ee.ee_priority = priority;
+
+ return (eventhandler_register_internal(list, name, &eg->ee));
+}
+#endif
+
+void
+eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag)
+{
+ struct eventhandler_entry *ep = tag;
+
+ EHL_LOCK_ASSERT(list, MA_OWNED);
+ if (ep != NULL) {
+ /* remove just this entry */
+ if (list->el_runcount == 0) {
+ CTR3(KTR_EVH, "%s: removing item %p from \"%s\"", __func__, ep,
+ list->el_name);
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ } else {
+ CTR3(KTR_EVH, "%s: marking item %p from \"%s\" as dead", __func__,
+ ep, list->el_name);
+ ep->ee_priority = EHE_DEAD_PRIORITY;
+ }
+ } else {
+ /* remove entire list */
+ if (list->el_runcount == 0) {
+ CTR2(KTR_EVH, "%s: removing all items from \"%s\"", __func__,
+ list->el_name);
+ while (!TAILQ_EMPTY(&list->el_entries)) {
+ ep = TAILQ_FIRST(&list->el_entries);
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ }
+ } else {
+ CTR2(KTR_EVH, "%s: marking all items from \"%s\" as dead",
+ __func__, list->el_name);
+ TAILQ_FOREACH(ep, &list->el_entries, ee_link)
+ ep->ee_priority = EHE_DEAD_PRIORITY;
+ }
+ }
+ while (list->el_runcount > 0)
+ mtx_sleep(list, &list->el_lock, 0, "evhrm", 0);
+ EHL_UNLOCK(list);
+}
+
+/*
+ * Internal version for use when eventhandler list is already locked.
+ */
+static struct eventhandler_list *
+_eventhandler_find_list(const char *name)
+{
+ struct eventhandler_list *list;
+
+ mtx_assert(&eventhandler_mutex, MA_OWNED);
+ TAILQ_FOREACH(list, &eventhandler_lists, el_link) {
+ if (!strcmp(name, list->el_name))
+ break;
+ }
+ return (list);
+}
+
+/*
+ * Lookup a "slow" list by name. Returns with the list locked.
+ */
+struct eventhandler_list *
+eventhandler_find_list(const char *name)
+{
+ struct eventhandler_list *list;
+
+ if (!eventhandler_lists_initted)
+ return(NULL);
+
+ /* scan looking for the requested list */
+ mtx_lock(&eventhandler_mutex);
+ list = _eventhandler_find_list(name);
+ if (list != NULL)
+ EHL_LOCK(list);
+ mtx_unlock(&eventhandler_mutex);
+
+ return(list);
+}
+
+/*
+ * Prune "dead" entries from an eventhandler list.
+ */
+void
+eventhandler_prune_list(struct eventhandler_list *list)
+{
+ struct eventhandler_entry *ep, *en;
+ int pruned = 0;
+
+ CTR2(KTR_EVH, "%s: pruning list \"%s\"", __func__, list->el_name);
+ EHL_LOCK_ASSERT(list, MA_OWNED);
+ TAILQ_FOREACH_SAFE(ep, &list->el_entries, ee_link, en) {
+ if (ep->ee_priority == EHE_DEAD_PRIORITY) {
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ pruned++;
+ }
+ }
+ if (pruned > 0)
+ wakeup(list);
+}
diff --git a/sys/kern/subr_fattime.c b/sys/kern/subr_fattime.c
new file mode 100644
index 0000000..1fb207e
--- /dev/null
+++ b/sys/kern/subr_fattime.c
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2006 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ * Convert MS-DOS FAT format timestamps to and from unix timespecs
+ *
+ * FAT file timestamps originally consisted of two 16-bit integers, encoded like
+ * this:
+ *
+ * yyyyyyymmmmddddd (year - 1980, month, day)
+ *
+ * hhhhhmmmmmmsssss (hour, minutes, seconds divided by two)
+ *
+ * Subsequently even Microsoft realized that files could be accessed in less
+ * than two seconds and a byte was added containing:
+ *
+ * sfffffff (second mod two, 100ths of second)
+ *
+ * FAT timestamps are in the local timezone, with no indication of which
+ * timezone, much less whether daylight saving time applies.
+ *
+ * Later, in Windows NT, timestamps were defined relative to GMT.
+ *
+ * Purists will point out that UTC replaced GMT for such uses around
+ * a century ago, already then. Ironically "NT" was an abbreviation of
+ * "New Technology". Anyway...
+ *
+ * The 'utc' argument determines if the resulting FATTIME timestamp
+ * should be on the UTC or local timezone calendar.
+ *
+ * The conversion functions below cut time into four-year leap-year
+ * cycles rather than single years and use table lookups inside those
+ * cycles to get the months and years sorted out.
+ *
+ * Obviously we cannot calculate the correct table index going from
+ * a posix seconds count to Y/M/D, but we can get pretty close by
+ * dividing the daycount by 32 (which gives an index that is too low),
+ * and then adjusting upwards a couple of steps if necessary.
+ *
+ * FAT timestamps have 7 bits for the year and start at 1980, so
+ * they can represent dates up to 2107, which means that the non-leap
+ * year 2100 must be handled.
+ *
+ * XXX: As long as time_t is 32 bits this is not relevant or easily
+ * XXX: testable. Revisit when time_t grows bigger.
+ * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year
+ *
+ */
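+
+/*
+ * Worked example of the encoding (the date below is hypothetical and
+ * used only for illustration): 2012-03-15 12:34:56 is stored as
+ *
+ *	dd = ((2012 - 1980) << 9) | (3 << 5) | 15 = 0x406f
+ *	dt = (12 << 11) | (34 << 5) | (56 / 2)    = 0x645c
+ *	dh = (56 % 2) * 100 + hundredths          = hundredths
+ */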
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/clock.h>
+
+#define DAY (24 * 60 * 60) /* Length of day in seconds */
+#define YEAR 365 /* Length of normal year */
+#define LYC (4 * YEAR + 1) /* Length of 4 year leap-year cycle */
+#define T1980 (10 * 365 + 2) /* Days from 1970 to 1980 */
+
+/* End of month is N days from start of (normal) year */
+#define JAN 31
+#define FEB (JAN + 28)
+#define MAR (FEB + 31)
+#define APR (MAR + 30)
+#define MAY (APR + 31)
+#define JUN (MAY + 30)
+#define JUL (JUN + 31)
+#define AUG (JUL + 31)
+#define SEP (AUG + 30)
+#define OCT (SEP + 31)
+#define NOV (OCT + 30)
+#define DEC (NOV + 31)
+
+/* Table of months in a 4 year leap-year cycle */
+
+#define ENC(y,m) (((y) << 9) | ((m) << 5))
+
+static const struct {
+ uint16_t days; /* month start in days relative to cycle */
+ uint16_t coded; /* encoded year + month information */
+} mtab[48] = {
+ { 0 + 0 * YEAR, ENC(0, 1) },
+
+ { JAN + 0 * YEAR, ENC(0, 2) }, { FEB + 0 * YEAR + 1, ENC(0, 3) },
+ { MAR + 0 * YEAR + 1, ENC(0, 4) }, { APR + 0 * YEAR + 1, ENC(0, 5) },
+ { MAY + 0 * YEAR + 1, ENC(0, 6) }, { JUN + 0 * YEAR + 1, ENC(0, 7) },
+ { JUL + 0 * YEAR + 1, ENC(0, 8) }, { AUG + 0 * YEAR + 1, ENC(0, 9) },
+ { SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) },
+ { NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1) },
+
+ { JAN + 1 * YEAR + 1, ENC(1, 2) }, { FEB + 1 * YEAR + 1, ENC(1, 3) },
+ { MAR + 1 * YEAR + 1, ENC(1, 4) }, { APR + 1 * YEAR + 1, ENC(1, 5) },
+ { MAY + 1 * YEAR + 1, ENC(1, 6) }, { JUN + 1 * YEAR + 1, ENC(1, 7) },
+ { JUL + 1 * YEAR + 1, ENC(1, 8) }, { AUG + 1 * YEAR + 1, ENC(1, 9) },
+ { SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) },
+ { NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1) },
+
+ { JAN + 2 * YEAR + 1, ENC(2, 2) }, { FEB + 2 * YEAR + 1, ENC(2, 3) },
+ { MAR + 2 * YEAR + 1, ENC(2, 4) }, { APR + 2 * YEAR + 1, ENC(2, 5) },
+ { MAY + 2 * YEAR + 1, ENC(2, 6) }, { JUN + 2 * YEAR + 1, ENC(2, 7) },
+ { JUL + 2 * YEAR + 1, ENC(2, 8) }, { AUG + 2 * YEAR + 1, ENC(2, 9) },
+ { SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) },
+ { NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1) },
+
+ { JAN + 3 * YEAR + 1, ENC(3, 2) }, { FEB + 3 * YEAR + 1, ENC(3, 3) },
+ { MAR + 3 * YEAR + 1, ENC(3, 4) }, { APR + 3 * YEAR + 1, ENC(3, 5) },
+ { MAY + 3 * YEAR + 1, ENC(3, 6) }, { JUN + 3 * YEAR + 1, ENC(3, 7) },
+ { JUL + 3 * YEAR + 1, ENC(3, 8) }, { AUG + 3 * YEAR + 1, ENC(3, 9) },
+ { SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) },
+ { NOV + 3 * YEAR + 1, ENC(3, 12) }
+};
+
+
+void
+timespec2fattime(struct timespec *tsp, int utc, uint16_t *ddp, uint16_t *dtp, uint8_t *dhp)
+{
+ time_t t1;
+ unsigned t2, l, m;
+
+ t1 = tsp->tv_sec;
+ if (!utc)
+ t1 -= utc_offset();
+
+ if (dhp != NULL)
+ *dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000;
+ if (dtp != NULL) {
+ *dtp = (t1 / 2) % 30;
+ *dtp |= ((t1 / 60) % 60) << 5;
+ *dtp |= ((t1 / 3600) % 24) << 11;
+ }
+ if (ddp != NULL) {
+ t2 = t1 / DAY;
+ if (t2 < T1980) {
+ /* Impossible date, truncate to 1980-01-01 */
+ *ddp = 0x0021;
+ } else {
+ t2 -= T1980;
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (t2 >= ((2100 - 1980) / 4 * LYC + FEB))
+ t2++;
+
+ /* Account for full leapyear cycles */
+ l = t2 / LYC;
+ *ddp = (l * 4) << 9;
+ t2 -= l * LYC;
+
+ /* Find approximate table entry */
+ m = t2 / 32;
+
+ /* Find correct table entry */
+ while (m < 47 && mtab[m + 1].days <= t2)
+ m++;
+
+ /* Get year + month from the table */
+ *ddp += mtab[m].coded;
+
+ /* And apply the day in the month */
+ t2 -= mtab[m].days - 1;
+ *ddp |= t2;
+ }
+ }
+}
+
+/*
+ * Table indexed by the bottom two bits of year + four bits of the month
+ * from the FAT timestamp, returning number of days into 4 year long
+ * leap-year cycle
+ */
+
+#define DCOD(m, y, l) ((m) + YEAR * (y) + (l))
+static const uint16_t daytab[64] = {
+ 0, DCOD( 0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1),
+ DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1),
+ DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1),
+ DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0, 0,
+ 0, DCOD( 0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1),
+ DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1),
+ DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1),
+ DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0, 0,
+ 0, DCOD( 0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1),
+ DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1),
+ DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1),
+ DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0, 0,
+ 0, DCOD( 0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1),
+ DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1),
+ DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1),
+ DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0, 0
+};
+
+void
+fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp)
+{
+ unsigned day;
+
+ /* Unpack time fields */
+ tsp->tv_sec = (dt & 0x1f) << 1;
+ tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60;
+ tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600;
+ tsp->tv_sec += dh / 100;
+ tsp->tv_nsec = (dh % 100) * 10000000;
+
+ /* Day of month */
+ day = (dd & 0x1f) - 1;
+
+ /* Full leap-year cycles */
+ day += LYC * ((dd >> 11) & 0x1f);
+
+ /* Month offset from leap-year cycle */
+ day += daytab[(dd >> 5) & 0x3f];
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (day >= ((2100 - 1980) / 4 * LYC + FEB))
+ day--;
+
+ /* Align with time_t epoch */
+ day += T1980;
+
+ tsp->tv_sec += DAY * day;
+ if (!utc)
+ tsp->tv_sec += utc_offset();
+}
+
+#ifdef TEST_DRIVER
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+int
+main(int argc __unused, char **argv __unused)
+{
+ int i;
+ struct timespec ts;
+ struct tm tm;
+ double a;
+ uint16_t d, t;
+ uint8_t p;
+ char buf[100];
+
+ for (i = 0; i < 10000; i++) {
+ do {
+ ts.tv_sec = random();
+ } while (ts.tv_sec < T1980 * 86400);
+ ts.tv_nsec = random() % 1000000000;
+
+ printf("%10d.%03ld -- ", ts.tv_sec, ts.tv_nsec / 1000000);
+
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+
+ a = ts.tv_sec + ts.tv_nsec * 1e-9;
+ d = t = p = 0;
+		timespec2fattime(&ts, 1, &d, &t, &p);
+ printf("%04x %04x %02x -- ", d, t, p);
+ printf("%3d %02d %02d %02d %02d %02d -- ",
+ ((d >> 9) & 0x7f) + 1980,
+ (d >> 5) & 0x0f,
+ (d >> 0) & 0x1f,
+ (t >> 11) & 0x1f,
+ (t >> 5) & 0x3f,
+ ((t >> 0) & 0x1f) * 2);
+
+ ts.tv_sec = ts.tv_nsec = 0;
+		fattime2timespec(d, t, p, 1, &ts);
+		printf("%10ld.%03ld == ", (long)ts.tv_sec, ts.tv_nsec / 1000000);
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+ a -= ts.tv_sec + ts.tv_nsec * 1e-9;
+ printf("%.3f", a);
+ printf("\n");
+ }
+ return (0);
+}
+
+#endif /* TEST_DRIVER */
diff --git a/sys/kern/subr_firmware.c b/sys/kern/subr_firmware.c
new file mode 100644
index 0000000..20ab76e
--- /dev/null
+++ b/sys/kern/subr_firmware.c
@@ -0,0 +1,537 @@
+/*-
+ * Copyright (c) 2005-2008, Sam Leffler <sam@errno.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+#include <sys/linker.h>
+#include <sys/firmware.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/module.h>
+#include <sys/eventhandler.h>
+
+#include <sys/filedesc.h>
+#include <sys/vnode.h>
+
+/*
+ * Loadable firmware support. See sys/sys/firmware.h and firmware(9)
+ * for more details on the subsystem.
+ *
+ * 'struct firmware' is the user-visible part of the firmware table.
+ * Additional internal information is stored in a 'struct priv_fw'
+ * (currently a static array). A slot is in use if FW_INUSE is true:
+ */
+
+#define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL)
+
+/*
+ * fw.name != NULL when an image is registered; file != NULL for
+ * autoloaded images whose handling has not been completed.
+ *
+ * The state of a slot evolves as follows:
+ * firmware_register --> fw.name = image_name
+ * (autoloaded image) --> file = module reference
+ * firmware_unregister --> fw.name = NULL
+ * (unloadentry complete) --> file = NULL
+ *
+ * In order for the above to work, the 'file' field must remain
+ * unchanged in firmware_unregister().
+ *
+ * Images residing in the same module are linked to each other
+ * through the 'parent' argument of firmware_register().
+ * One image (typically, one with the same name as the module to let
+ * the autoloading mechanism work) is considered the parent image for
+ * all other images in the same module. Children affect the refcount
+ * on the parent image preventing improper unloading of the image itself.
+ */
+
+struct priv_fw {
+ int refcnt; /* reference count */
+
+ /*
+ * parent entry, see above. Set on firmware_register(),
+ * cleared on firmware_unregister().
+ */
+ struct priv_fw *parent;
+
+ int flags; /* record FIRMWARE_UNLOAD requests */
+#define FW_UNLOAD 0x100
+
+ /*
+ * 'file' is private info managed by the autoload/unload code.
+ * Set at the end of firmware_get(), cleared only in the
+ * firmware_unload_task, so the latter can depend on its value even
+ * while the lock is not held.
+ */
+ linker_file_t file; /* module file, if autoloaded */
+
+ /*
+ * 'fw' is the externally visible image information.
+ * We do not make it the first field in priv_fw, to avoid the
+ * temptation of casting pointers to each other.
+ * Use PRIV_FW(fw) to get a pointer to the container of fw.
+ * Beware, PRIV_FW does not work for a NULL pointer.
+ */
+ struct firmware fw; /* externally visible information */
+};
+
+/*
+ * PRIV_FW returns the pointer to the container of struct firmware *x.
+ * Cast to intptr_t to override the 'const' attribute of x
+ */
+#define PRIV_FW(x) ((struct priv_fw *) \
+ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) )
+
+/*
+ * At the moment we use a static array as backing store for the registry.
+ * Should we move to a dynamic structure, keep in mind that we cannot
+ * reallocate the array because pointers are held externally.
+ * A list may work, though.
+ */
+#define FIRMWARE_MAX 50
+static struct priv_fw firmware_table[FIRMWARE_MAX];
+
+/*
+ * Firmware module operations are handled in a separate task as they
+ * might sleep and they require directory context to do i/o.
+ */
+static struct taskqueue *firmware_tq;
+static struct task firmware_unload_task;
+
+/*
+ * This mutex protects accesses to the firmware table.
+ */
+static struct mtx firmware_mtx;
+MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF);
+
+/*
+ * Helper function to lookup a name.
+ * As a side effect, it sets the pointer to a free slot, if any.
+ * This way we can concentrate most of the registry scanning in
+ * this function, which makes it easier to replace the registry
+ * with some other data structure.
+ */
+static struct priv_fw *
+lookup(const char *name, struct priv_fw **empty_slot)
+{
+ struct priv_fw *fp = NULL;
+ struct priv_fw *dummy;
+ int i;
+
+ if (empty_slot == NULL)
+ empty_slot = &dummy;
+ *empty_slot = NULL;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0)
+ break;
+ else if (!FW_INUSE(fp))
+ *empty_slot = fp;
+ }
+ return (i < FIRMWARE_MAX ) ? fp : NULL;
+}
+
+/*
+ * Register a firmware image with the specified name. The
+ * image name must not already be registered. If this is a
+ * subimage then parent refers to a previously registered
+ * image that this should be associated with.
+ */
+const struct firmware *
+firmware_register(const char *imagename, const void *data, size_t datasize,
+ unsigned int version, const struct firmware *parent)
+{
+ struct priv_fw *match, *frp;
+ char *str;
+
+ str = strdup(imagename, M_TEMP);
+
+ mtx_lock(&firmware_mtx);
+ /*
+ * Do a lookup to make sure the name is unique or find a free slot.
+ */
+ match = lookup(imagename, &frp);
+ if (match != NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: image %s already registered!\n",
+ __func__, imagename);
+ free(str, M_TEMP);
+ return NULL;
+ }
+ if (frp == NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: cannot register image %s, firmware table full!\n",
+ __func__, imagename);
+ free(str, M_TEMP);
+ return NULL;
+ }
+ bzero(frp, sizeof(*frp)); /* start from a clean record */
+ frp->fw.name = str;
+ frp->fw.data = data;
+ frp->fw.datasize = datasize;
+ frp->fw.version = version;
+ if (parent != NULL)
+ frp->parent = PRIV_FW(parent);
+ mtx_unlock(&firmware_mtx);
+ if (bootverbose)
+ printf("firmware: '%s' version %u: %zu bytes loaded at %p\n",
+ imagename, version, datasize, data);
+ return &frp->fw;
+}
+
+/*
+ * Unregister/remove a firmware image. If there are outstanding
+ * references an error is returned and the image is not removed
+ * from the registry.
+ */
+int
+firmware_unregister(const char *imagename)
+{
+ struct priv_fw *fp;
+ int err;
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ /*
+ * It is ok for the lookup to fail; this can happen
+ * when a module is unloaded on last reference and the
+	 * module unload handler unregisters each of its
+ * firmware images.
+ */
+ err = 0;
+ } else if (fp->refcnt != 0) { /* cannot unregister */
+ err = EBUSY;
+ } else {
+ linker_file_t x = fp->file; /* save value */
+
+ /*
+ * Clear the whole entry with bzero to make sure we
+ * do not forget anything. Then restore 'file' which is
+ * non-null for autoloaded images.
+ */
+ free((void *) (uintptr_t) fp->fw.name, M_TEMP);
+ bzero(fp, sizeof(struct priv_fw));
+ fp->file = x;
+ err = 0;
+ }
+ mtx_unlock(&firmware_mtx);
+ return err;
+}
+
+static void
+loadimage(void *arg, int npending)
+{
+ struct thread *td = curthread;
+ char *imagename = arg;
+ struct priv_fw *fp;
+ linker_file_t result;
+ int error;
+
+ /* synchronize with the thread that dispatched us */
+ mtx_lock(&firmware_mtx);
+ mtx_unlock(&firmware_mtx);
+
+ if (td->td_proc->p_fd->fd_rdir == NULL) {
+ printf("%s: root not mounted yet, no way to load image\n",
+ imagename);
+ goto done;
+ }
+ error = linker_reference_module(imagename, NULL, &result);
+ if (error != 0) {
+ printf("%s: could not load firmware image, error %d\n",
+ imagename, error);
+ goto done;
+ }
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL || fp->file != NULL) {
+ mtx_unlock(&firmware_mtx);
+ if (fp == NULL)
+ printf("%s: firmware image loaded, "
+ "but did not register\n", imagename);
+ (void) linker_release_module(imagename, NULL, NULL);
+ goto done;
+ }
+ fp->file = result; /* record the module identity */
+ mtx_unlock(&firmware_mtx);
+done:
+ wakeup_one(imagename); /* we're done */
+}
+
+/*
+ * Lookup and potentially load the specified firmware image.
+ * If the firmware is not found in the registry, try to load a kernel
+ * module named as the image name.
+ * If the firmware is located, a reference is returned. The caller must
+ * release this reference for the image to be eligible for removal/unload.
+ */
+const struct firmware *
+firmware_get(const char *imagename)
+{
+ struct task fwload_task;
+ struct thread *td;
+ struct priv_fw *fp;
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp != NULL)
+ goto found;
+ /*
+ * Image not present, try to load the module holding it.
+ */
+ td = curthread;
+ if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 ||
+ securelevel_gt(td->td_ucred, 0) != 0) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: insufficient privileges to "
+ "load firmware image %s\n", __func__, imagename);
+ return NULL;
+ }
+ /*
+ * Defer load to a thread with known context. linker_reference_module
+ * may do filesystem i/o which requires root & current dirs, etc.
+ * Also we must not hold any mtx's over this call which is problematic.
+ */
+ if (!cold) {
+ TASK_INIT(&fwload_task, 0, loadimage, __DECONST(void *,
+ imagename));
+ taskqueue_enqueue(firmware_tq, &fwload_task);
+ msleep(__DECONST(void *, imagename), &firmware_mtx, 0,
+ "fwload", 0);
+ }
+ /*
+ * After attempting to load the module, see if the image is registered.
+ */
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ mtx_unlock(&firmware_mtx);
+ return NULL;
+ }
+found: /* common exit point on success */
+ if (fp->refcnt == 0 && fp->parent != NULL)
+ fp->parent->refcnt++;
+ fp->refcnt++;
+ mtx_unlock(&firmware_mtx);
+ return &fp->fw;
+}
+
+/*
+ * Release a reference to a firmware image returned by firmware_get.
+ * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire
+ * to release the resource, but the flag is only advisory.
+ *
+ * If this is the last reference to the firmware image, and this is an
+ * autoloaded module, wake up the firmware_unload_task to figure out
+ * what to do with the associated module.
+ */
+void
+firmware_put(const struct firmware *p, int flags)
+{
+ struct priv_fw *fp = PRIV_FW(p);
+
+ mtx_lock(&firmware_mtx);
+ fp->refcnt--;
+ if (fp->refcnt == 0) {
+ if (fp->parent != NULL)
+ fp->parent->refcnt--;
+ if (flags & FIRMWARE_UNLOAD)
+ fp->flags |= FW_UNLOAD;
+ if (fp->file)
+ taskqueue_enqueue(firmware_tq, &firmware_unload_task);
+ }
+ mtx_unlock(&firmware_mtx);
+}
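+
+/*
+ * Illustrative usage sketch (not part of this file; the image name
+ * "foo_fw" is hypothetical): a driver typically brackets its use of an
+ * image with firmware_get() and firmware_put():
+ *
+ *	const struct firmware *fp;
+ *
+ *	fp = firmware_get("foo_fw");
+ *	if (fp == NULL)
+ *		return (ENOENT);
+ *	... upload fp->data, fp->datasize bytes, to the device ...
+ *	firmware_put(fp, FIRMWARE_UNLOAD);
+ */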
+
+/*
+ * Set up directory state for the firmware_tq thread so we can do i/o.
+ */
+static void
+set_rootvnode(void *arg, int npending)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+
+ FILEDESC_XLOCK(p->p_fd);
+ if (p->p_fd->fd_cdir == NULL) {
+ p->p_fd->fd_cdir = rootvnode;
+ VREF(rootvnode);
+ }
+ if (p->p_fd->fd_rdir == NULL) {
+ p->p_fd->fd_rdir = rootvnode;
+ VREF(rootvnode);
+ }
+ FILEDESC_XUNLOCK(p->p_fd);
+
+ free(arg, M_TEMP);
+}
+
+/*
+ * Event handler called on mounting of /; bounce a task
+ * into the task queue thread to set up its directories.
+ */
+static void
+firmware_mountroot(void *arg)
+{
+ struct task *setroot_task;
+
+ setroot_task = malloc(sizeof(struct task), M_TEMP, M_NOWAIT);
+ if (setroot_task != NULL) {
+ TASK_INIT(setroot_task, 0, set_rootvnode, setroot_task);
+ taskqueue_enqueue(firmware_tq, setroot_task);
+ } else
+ printf("%s: no memory for task!\n", __func__);
+}
+EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NULL, 0);
+
+/*
+ * The body of the task in charge of unloading autoloaded modules
+ * that are not needed anymore.
+ * Images can be cross-linked so we may need to make multiple passes,
+ * but the time we spend in the loop is bounded because we clear entries
+ * as we touch them.
+ */
+static void
+unloadentry(void *unused1, int unused2)
+{
+ int limit = FIRMWARE_MAX;
+ int i; /* current cycle */
+
+ mtx_lock(&firmware_mtx);
+ /*
+ * Scan the table. limit is set to make sure we make another
+ * full sweep after matching an entry that requires unloading.
+ */
+ for (i = 0; i < limit; i++) {
+ struct priv_fw *fp;
+ int err;
+
+ fp = &firmware_table[i % FIRMWARE_MAX];
+ if (fp->fw.name == NULL || fp->file == NULL ||
+ fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0)
+ continue;
+
+ /*
+ * Found an entry. Now:
+ * 1. bump up limit to make sure we make another full round;
+ * 2. clear FW_UNLOAD so we don't try this entry again.
+ * 3. release the lock while trying to unload the module.
+ * 'file' remains set so that the entry cannot be reused
+ * in the meantime (it also means that fp->file will
+ * not change while we release the lock).
+ */
+ limit = i + FIRMWARE_MAX; /* make another full round */
+ fp->flags &= ~FW_UNLOAD; /* do not try again */
+
+ mtx_unlock(&firmware_mtx);
+ err = linker_release_module(NULL, NULL, fp->file);
+ mtx_lock(&firmware_mtx);
+
+ /*
+ * We rely on the module to call firmware_unregister()
+ * on unload to actually release the entry.
+ * If err = 0 we can drop our reference as the system
+ * accepted it. Otherwise unloading failed (e.g. the
+ * module itself gave an error) so our reference is
+ * still valid.
+ */
+ if (err == 0)
+ fp->file = NULL;
+ }
+ mtx_unlock(&firmware_mtx);
+}
+
+/*
+ * Module glue.
+ */
+static int
+firmware_modevent(module_t mod, int type, void *unused)
+{
+ struct priv_fw *fp;
+ int i, err;
+
+ switch (type) {
+ case MOD_LOAD:
+ TASK_INIT(&firmware_unload_task, 0, unloadentry, NULL);
+ firmware_tq = taskqueue_create("taskqueue_firmware", M_WAITOK,
+ taskqueue_thread_enqueue, &firmware_tq);
+ /* NB: use our own loop routine that sets up context */
+ (void) taskqueue_start_threads(&firmware_tq, 1, PWAIT,
+ "firmware taskq");
+ if (rootvnode != NULL) {
+ /*
+ * Root is already mounted so we won't get an event;
+ * simulate one here.
+ */
+ firmware_mountroot(NULL);
+ }
+ return 0;
+
+ case MOD_UNLOAD:
+ /* request all autoloaded modules to be released */
+ mtx_lock(&firmware_mtx);
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ fp->flags |= FW_UNLOAD;
+ }
+ mtx_unlock(&firmware_mtx);
+ taskqueue_enqueue(firmware_tq, &firmware_unload_task);
+ taskqueue_drain(firmware_tq, &firmware_unload_task);
+ err = 0;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL) {
+ printf("%s: image %p ref %d still active slot %d\n",
+ __func__, fp->fw.name,
+ fp->refcnt, i);
+ err = EINVAL;
+ }
+ }
+ if (err == 0)
+ taskqueue_free(firmware_tq);
+ return err;
+ }
+ return EINVAL;
+}
+
+static moduledata_t firmware_mod = {
+ "firmware",
+ firmware_modevent,
+ NULL
+};
+DECLARE_MODULE(firmware, firmware_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(firmware, 1);
diff --git a/sys/kern/subr_hash.c b/sys/kern/subr_hash.c
new file mode 100644
index 0000000..5533882
--- /dev/null
+++ b/sys/kern/subr_hash.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+/*
+ * General routine to allocate a hash table with control of memory flags.
+ */
+void *
+hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
+ int flags)
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ KASSERT(elements > 0, ("%s: bad elements", __func__));
+ /* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
+ KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
+ ("Bad flags (0x%x) passed to hashinit_flags", flags));
+
+ for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
+ continue;
+ hashsize >>= 1;
+
+ if (flags & HASH_NOWAIT)
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_NOWAIT);
+ else
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_WAITOK);
+
+ if (hashtbl != NULL) {
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *hashmask = hashsize - 1;
+ }
+ return (hashtbl);
+}
+
+/*
+ * Allocate and initialize a hash table with default flag: may sleep.
+ */
+void *
+hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+{
+
+ return (hashinit_flags(elements, type, hashmask, HASH_WAITOK));
+}
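+
+/*
+ * Illustrative usage sketch (not part of this file; the names foo,
+ * foo_link and the M_DEVBUF malloc type are just examples): the
+ * returned array is indexed by masking a hash value with *hashmask:
+ *
+ *	u_long mask;
+ *	LIST_HEAD(, foo) *table;
+ *
+ *	table = hashinit(nelements, M_DEVBUF, &mask);
+ *	LIST_INSERT_HEAD(&table[hash & mask], elem, foo_link);
+ */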
+
+void
+hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
+{
+ LIST_HEAD(generic, generic) *hashtbl, *hp;
+
+ hashtbl = vhashtbl;
+ for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
+ KASSERT(LIST_EMPTY(hp), ("%s: hash not empty", __func__));
+ free(hashtbl, type);
+}
+
+static const int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531,
+ 2039, 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143,
+ 6653, 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
+#define NPRIMES (sizeof(primes) / sizeof(primes[0]))
+
+/*
+ * General routine to allocate a prime number sized hash table.
+ */
+void *
+phashinit(int elements, struct malloc_type *type, u_long *nentries)
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ KASSERT(elements > 0, ("%s: bad elements", __func__));
+ for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
+ i++;
+ if (i == NPRIMES)
+ break;
+ hashsize = primes[i];
+ }
+ hashsize = primes[i - 1];
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *nentries = hashsize;
+ return (hashtbl);
+}
diff --git a/sys/kern/subr_hints.c b/sys/kern/subr_hints.c
new file mode 100644
index 0000000..db45fb8
--- /dev/null
+++ b/sys/kern/subr_hints.c
@@ -0,0 +1,463 @@
+/*-
+ * Copyright (c) 2000,2001 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+/*
+ * Access functions for device resources.
+ */
+
+static int checkmethod = 1;
+static int use_kenv;
+static char *hintp;
+
+/*
+ * Define the kern.hintmode sysctl, which only accepts the value 2; setting
+ * it switches from the static KENV mode to the dynamic KENV, so systems with
+ * hints compiled into the kernel can then see/modify the KENV (and the hints).
+ */
+
+static int
+sysctl_hintmode(SYSCTL_HANDLER_ARGS)
+{
+ const char *cp;
+ char *line, *eq;
+ int eqidx, error, from_kenv, i, value;
+
+ from_kenv = 0;
+ cp = kern_envp;
+ value = hintmode;
+
+ /* Fetch candidate for new hintmode value */
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ if (value != 2)
+		/* Only accept switching to hintmode 2 */
+ return (EINVAL);
+
+ /* Migrate from static to dynamic hints */
+ switch (hintmode) {
+ case 0:
+ if (dynamic_kenv) {
+ /*
+			 * Already there, but set hintmode to 2 so we do
+			 * not need to check it again in the future.
+ */
+ hintmode = 2;
+ return (0);
+ }
+ from_kenv = 1;
+ cp = kern_envp;
+ break;
+ case 1:
+ cp = static_hints;
+ break;
+ case 2:
+ /* Nothing to do, hintmode already 2 */
+ return (0);
+ }
+
+	while (cp) {
+		i = strlen(cp);
+		if (i == 0)
+			break;
+		if (from_kenv && strncmp(cp, "hint.", 5) != 0) {
+			/* The kenv can contain more than hints; skip it. */
+			cp += i + 1;
+			continue;
+		}
+		eq = strchr(cp, '=');
+		if (eq == NULL) {
+			/* Malformed hint value; skip it. */
+			cp += i + 1;
+			continue;
+		}
+		eqidx = eq - cp;
+
+		line = malloc(i + 1, M_TEMP, M_WAITOK);
+		strcpy(line, cp);
+		line[eqidx] = '\0';
+		setenv(line, line + eqidx + 1);
+		free(line, M_TEMP);
+		cp += i + 1;
+	}
+
+ hintmode = value;
+ use_kenv = 1;
+ return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, hintmode, CTLTYPE_INT|CTLFLAG_RW,
+ &hintmode, 0, sysctl_hintmode, "I", "Get/set current hintmode");
+
+/*
+ * Evil wildcarding resource string lookup.
+ * This walks the supplied env string table and returns a match.
+ * The start point can be remembered for incremental searches.
+ */
+static int
+res_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int n = 0, hit, i = 0;
+ char r_name[32];
+ int r_unit;
+ char r_resname[32];
+ char r_value[128];
+ const char *s, *cp;
+ char *p;
+
+ if (checkmethod) {
+ hintp = NULL;
+
+ switch (hintmode) {
+ case 0: /* loader hints in environment only */
+ break;
+ case 1: /* static hints only */
+ hintp = static_hints;
+ checkmethod = 0;
+ break;
+ case 2: /* fallback mode */
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = kenvp[0];
+ for (i = 0; cp != NULL; cp = kenvp[++i]) {
+ if (!strncmp(cp, "hint.", 5)) {
+ use_kenv = 1;
+ checkmethod = 0;
+ break;
+ }
+ }
+ mtx_unlock(&kenv_lock);
+ } else {
+ cp = kern_envp;
+ while (cp) {
+ if (strncmp(cp, "hint.", 5) == 0) {
+ cp = NULL;
+ hintp = kern_envp;
+ break;
+ }
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ hintp = static_hints;
+ break;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ if (hintp == NULL) {
+ if (dynamic_kenv) {
+ use_kenv = 1;
+ checkmethod = 0;
+ } else
+ hintp = kern_envp;
+ }
+ }
+
+ if (use_kenv) {
+ mtx_lock(&kenv_lock);
+ i = 0;
+ cp = kenvp[0];
+ if (cp == NULL) {
+ mtx_unlock(&kenv_lock);
+ return (ENOENT);
+ }
+ } else
+ cp = hintp;
+ while (cp) {
+ hit = 1;
+ (*line)++;
+ if (strncmp(cp, "hint.", 5) != 0)
+ hit = 0;
+ else
+			n = sscanf(cp, "hint.%31[^.].%d.%31[^=]=%127s",
+			    r_name, &r_unit, r_resname, r_value);
+ if (hit && n != 4) {
+ printf("CONFIG: invalid hint '%s'\n", cp);
+ p = strchr(cp, 'h');
+ *p = 'H';
+ hit = 0;
+ }
+ if (hit && startln && *startln >= 0 && *line < *startln)
+ hit = 0;
+ if (hit && name && strcmp(name, r_name) != 0)
+ hit = 0;
+ if (hit && unit && *unit != r_unit)
+ hit = 0;
+ if (hit && resname && strcmp(resname, r_resname) != 0)
+ hit = 0;
+ if (hit && value && strcmp(value, r_value) != 0)
+ hit = 0;
+ if (hit)
+ break;
+ if (use_kenv) {
+ cp = kenvp[++i];
+ if (cp == NULL)
+ break;
+ } else {
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ break;
+ }
+ }
+ }
+ if (use_kenv)
+ mtx_unlock(&kenv_lock);
+ if (cp == NULL)
+ return ENOENT;
+
+ s = cp;
+	/* This is a bit of a hack, but at least it is reentrant */
+ /* Note that it returns some !unterminated! strings. */
+ s = strchr(s, '.') + 1; /* start of device */
+ if (ret_name)
+ *ret_name = s;
+ s = strchr(s, '.') + 1; /* start of unit */
+ if (ret_namelen && ret_name)
+ *ret_namelen = s - *ret_name - 1; /* device length */
+ if (ret_unit)
+ *ret_unit = r_unit;
+ s = strchr(s, '.') + 1; /* start of resname */
+ if (ret_resname)
+ *ret_resname = s;
+ s = strchr(s, '=') + 1; /* start of value */
+ if (ret_resnamelen && ret_resname)
+ *ret_resnamelen = s - *ret_resname - 1; /* value len */
+ if (ret_value)
+ *ret_value = s;
+ if (startln) /* line number for anchor */
+ *startln = *line + 1;
+ return 0;
+}
+
+/*
+ * Search all the data sources for matches to our query. We look for
+ * dynamic hints first as overrides for static or fallback hints.
+ */
+static int
+resource_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int i;
+ int un;
+
+ *line = 0;
+
+ /* Search for exact unit matches first */
+ i = res_find(line, startln, name, unit, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ if (unit == NULL)
+ return ENOENT;
+ /* If we are still here, search for wildcard matches */
+ un = -1;
+ i = res_find(line, startln, name, &un, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ return ENOENT;
+}
+
+int
+resource_int_value(const char *name, int unit, const char *resname, int *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
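+
+/*
+ * Illustrative example (hypothetical names): with the loader hint
+ * hint.foo.0.irq="5" present, resource_int_value("foo", 0, "irq", &val)
+ * returns 0 and sets val to 5.
+ */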
+
+int
+resource_long_value(const char *name, int unit, const char *resname,
+ long *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
+
+int
+resource_string_value(const char *name, int unit, const char *resname,
+ const char **result)
+{
+ int error;
+ const char *str;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ *result = str;
+ return 0;
+}
+
+/*
+ * This is a bit nasty, but allows us to not modify the env strings.
+ */
+static const char *
+resource_string_copy(const char *s, int len)
+{
+ static char stringbuf[256];
+ static int offset = 0;
+ const char *ret;
+
+ if (len == 0)
+ len = strlen(s);
+ if (len > 255)
+ return NULL;
+ if ((offset + len + 1) > 255)
+ offset = 0;
+ bcopy(s, &stringbuf[offset], len);
+ stringbuf[offset + len] = '\0';
+ ret = &stringbuf[offset];
+ offset += len + 1;
+ return ret;
+}
+
+/*
+ * err = resource_find_match(&anchor, &name, &unit, resname, value)
+ * Iteratively fetch a list of devices wired "at" something.
+ * resname and value are restrictions, e.g. "at", "scbus0".
+ * For practical purposes, resname is required and value is optional.
+ * *name and *unit are set on each match.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_match(int *anchor, const char **name, int *unit,
+ const char *resname, const char *value)
+{
+ const char *found_name;
+ int found_namelen;
+ int found_unit;
+ int ret;
+ int newln;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, NULL, NULL, resname, value,
+ &found_name, &found_namelen, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *name = resource_string_copy(found_name, found_namelen);
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
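A brief usage sketch of the iterator just defined, grounded only in the comment and code above (the hint names are hypothetical):

    /*
     * Sketch: enumerate every device wired "at" a hypothetical bus
     * instance, e.g. hint.da.0.at="scbus0" in device.hints.
     */
    const char *name;
    int anchor, unit;

    anchor = 0;
    while (resource_find_match(&anchor, &name, &unit, "at", "scbus0") == 0)
            printf("wired: %s%d\n", name, unit);

Note that the returned name points into resource_string_copy()'s small static buffer, so callers that keep it across further lookups should copy it out first (e.g. with strlcpy()).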
+
+
+/*
+ * err = resource_find_dev(&anchor, name, &unit, resname, value);
+ * Iterate through a list of devices, returning their unit numbers.
+ * resname and value are optional restrictions, e.g. "at", "scbus0".
+ * *unit is set to the unit number of each matching device.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_dev(int *anchor, const char *name, int *unit,
+ const char *resname, const char *value)
+{
+ int found_unit;
+ int newln;
+ int ret;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, name, NULL, resname, value,
+ NULL, NULL, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
+
+/*
+ * Check to see if a device is disabled via a disabled hint.
+ */
+int
+resource_disabled(const char *name, int unit)
+{
+ int error, value;
+
+ error = resource_int_value(name, unit, "disabled", &value);
+ if (error)
+ return (0);
+ return (value);
+}
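As an illustration of the convention this helper supports (hedged sketch; "foo" is a hypothetical driver and the probe constants are the usual newbus ones), a hint such as hint.foo.0.disabled="1" is honored like this:

    /* Sketch: check the "disabled" hint at the top of a probe routine. */
    static int
    foo_probe(device_t dev)
    {

            if (resource_disabled("foo", device_get_unit(dev)))
                    return (ENXIO);         /* administratively disabled */
            /* ... normal probe logic ... */
            return (BUS_PROBE_DEFAULT);
    }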
diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c
new file mode 100644
index 0000000..59d6258
--- /dev/null
+++ b/sys/kern/subr_kdb.c
@@ -0,0 +1,675 @@
+/*-
+ * Copyright (c) 2004 The FreeBSD Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cons.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+
+#include <machine/kdb.h>
+#include <machine/pcb.h>
+
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+
+int kdb_active = 0;
+static void *kdb_jmpbufp = NULL;
+struct kdb_dbbe *kdb_dbbe = NULL;
+static struct pcb kdb_pcb;
+struct pcb *kdb_thrctx = NULL;
+struct thread *kdb_thread = NULL;
+struct trapframe *kdb_frame = NULL;
+
+#ifdef BREAK_TO_DEBUGGER
+#define KDB_BREAK_TO_DEBUGGER 1
+#else
+#define KDB_BREAK_TO_DEBUGGER 0
+#endif
+
+#ifdef ALT_BREAK_TO_DEBUGGER
+#define KDB_ALT_BREAK_TO_DEBUGGER 1
+#else
+#define KDB_ALT_BREAK_TO_DEBUGGER 0
+#endif
+
+static int kdb_break_to_debugger = KDB_BREAK_TO_DEBUGGER;
+static int kdb_alt_break_to_debugger = KDB_ALT_BREAK_TO_DEBUGGER;
+
+KDB_BACKEND(null, NULL, NULL, NULL, NULL);
+SET_DECLARE(kdb_dbbe_set, struct kdb_dbbe);
+
+static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS);
+
+static SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, NULL,
+ 0, kdb_sysctl_available, "A", "list of available KDB backends");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW, NULL,
+ 0, kdb_sysctl_current, "A", "currently selected KDB backend");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, enter,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_enter, "I", "set to enter the debugger");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, panic,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_panic, "I", "set to panic the kernel");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_trap, "I", "set to cause a page fault via data access");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
+
+SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN | CTLFLAG_SECURE,
+ &kdb_break_to_debugger, 0, "Enable break to debugger");
+TUNABLE_INT("debug.kdb.break_to_debugger", &kdb_break_to_debugger);
+
+SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN | CTLFLAG_SECURE,
+ &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger");
+TUNABLE_INT("debug.kdb.alt_break_to_debugger", &kdb_alt_break_to_debugger);
+
+/*
+ * Flag to indicate to debuggers why the debugger was entered.
+ */
+const char * volatile kdb_why = KDB_WHY_UNSET;
+
+static int
+kdb_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+ struct kdb_dbbe **iter;
+ struct sbuf sbuf;
+ int error;
+
+ sbuf_new_for_sysctl(&sbuf, NULL, 64, req);
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ if ((*iter)->dbbe_active == 0)
+ sbuf_printf(&sbuf, "%s ", (*iter)->dbbe_name);
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+static int
+kdb_sysctl_current(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int error;
+
+ if (kdb_dbbe != NULL)
+ strlcpy(buf, kdb_dbbe->dbbe_name, sizeof(buf));
+ else
+ *buf = '\0';
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (kdb_active)
+ return (EBUSY);
+ return (kdb_dbbe_select(buf));
+}
+
+static int
+kdb_sysctl_enter(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (kdb_active)
+ return (EBUSY);
+ kdb_enter(KDB_WHY_SYSCTL, "sysctl debug.kdb.enter");
+ return (0);
+}
+
+static int
+kdb_sysctl_panic(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ panic("kdb_sysctl_panic");
+ return (0);
+}
+
+static int
+kdb_sysctl_trap(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ int *addr = (int *)0x10;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (*addr);
+}
+
+static int
+kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ (*fp)(0x11111111, 0x22222222, 0x33333333);
+ return (0);
+}
+
+void
+kdb_panic(const char *msg)
+{
+
+ printf("KDB: panic\n");
+ panic("%s", msg);
+}
+
+void
+kdb_reboot(void)
+{
+
+ printf("KDB: reboot requested\n");
+ shutdown_nice(0);
+}
+
+/*
+ * Solaris implements a new BREAK that is initiated by the character sequence
+ * CR ~ ^b, similar to the familiar pattern used by the Remote Console on Sun
+ * servers.
+ *
+ * Note that this function may be called from almost anywhere, with interrupts
+ * disabled and with unknown locks held, so it must not access data other than
+ * its arguments. It's up to the caller to ensure that the state variable is
+ * consistent.
+ */
+
+#define KEY_CR 13 /* CR '\r' */
+#define KEY_TILDE 126 /* ~ */
+#define KEY_CRTLB 2 /* ^B */
+#define KEY_CRTLP 16 /* ^P */
+#define KEY_CRTLR 18 /* ^R */
+
+/* States of the KDB "alternate break sequence" detection state machine. */
+enum {
+ KDB_ALT_BREAK_SEEN_NONE,
+ KDB_ALT_BREAK_SEEN_CR,
+ KDB_ALT_BREAK_SEEN_CR_TILDE,
+};
+
+int
+kdb_break(void)
+{
+
+ if (!kdb_break_to_debugger)
+ return (0);
+ kdb_enter(KDB_WHY_BREAK, "Break to debugger");
+ return (KDB_REQ_DEBUGGER);
+}
+
+static int
+kdb_alt_break_state(int key, int *state)
+{
+ int brk;
+
+ /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR. */
+ if (key == KEY_CR) {
+ *state = KDB_ALT_BREAK_SEEN_CR;
+ return (0);
+ }
+
+ brk = 0;
+ switch (*state) {
+ case KDB_ALT_BREAK_SEEN_CR:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ if (key == KEY_TILDE)
+ *state = KDB_ALT_BREAK_SEEN_CR_TILDE;
+ break;
+ case KDB_ALT_BREAK_SEEN_CR_TILDE:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ if (key == KEY_CRTLB)
+ brk = KDB_REQ_DEBUGGER;
+ else if (key == KEY_CRTLP)
+ brk = KDB_REQ_PANIC;
+ else if (key == KEY_CRTLR)
+ brk = KDB_REQ_REBOOT;
+ break;
+ case KDB_ALT_BREAK_SEEN_NONE:
+ default:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ break;
+ }
+ return (brk);
+}
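For illustration, a hedged sketch of how a console driver might feed received characters through this state machine; the getc routine and the softc field are hypothetical names, not part of this change:

    /*
     * Sketch: recognize CR ~ ^B (debugger), CR ~ ^P (panic) and CR ~ ^R
     * (reboot) on a console input path.  sc->alt_brk_state is a
     * per-device int, initialized to 0 and passed back on every call.
     */
    int c;

    while ((c = hypothetical_cons_getc(sc)) != -1) {
            kdb_alt_break(c, &sc->alt_brk_state);
            /* ... then deliver 'c' to the tty layer as usual ... */
    }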
+
+static int
+kdb_alt_break_internal(int key, int *state, int force_gdb)
+{
+ int brk;
+
+ if (!kdb_alt_break_to_debugger)
+ return (0);
+ brk = kdb_alt_break_state(key, state);
+ switch (brk) {
+ case KDB_REQ_DEBUGGER:
+ if (force_gdb)
+ kdb_dbbe_select("gdb");
+ kdb_enter(KDB_WHY_BREAK, "Break to debugger");
+ break;
+
+ case KDB_REQ_PANIC:
+ if (force_gdb)
+ kdb_dbbe_select("gdb");
+ kdb_panic("Panic sequence on console");
+ break;
+
+ case KDB_REQ_REBOOT:
+ kdb_reboot();
+ break;
+ }
+ return (0);
+}
+
+int
+kdb_alt_break(int key, int *state)
+{
+
+ return (kdb_alt_break_internal(key, state, 0));
+}
+
+/*
+ * This variation on kdb_alt_break() is used only by dcons, which has its own
+ * configuration flag to force GDB use regardless of the global KDB
+ * configuration.
+ */
+int
+kdb_alt_break_gdb(int key, int *state)
+{
+
+ return (kdb_alt_break_internal(key, state, 1));
+}
+
+/*
+ * Print a backtrace of the calling thread. The backtrace is generated by
+ * the selected debugger, provided it supports backtraces. If no debugger
+ * is selected or the current debugger does not support backtraces, this
+ * function silently returns.
+ */
+
+void
+kdb_backtrace(void)
+{
+
+ if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace != NULL) {
+ printf("KDB: stack backtrace:\n");
+ kdb_dbbe->dbbe_trace();
+ }
+#ifdef STACK
+ else {
+ struct stack st;
+
+ printf("KDB: stack backtrace:\n");
+ stack_zero(&st);
+ stack_save(&st);
+ stack_print_ddb(&st);
+ }
+#endif
+}
+
+/*
+ * Similar to kdb_backtrace() except that it prints a backtrace of an
+ * arbitrary thread rather than the calling thread.
+ */
+void
+kdb_backtrace_thread(struct thread *td)
+{
+
+ if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace_thread != NULL) {
+ printf("KDB: stack backtrace of thread %d:\n", td->td_tid);
+ kdb_dbbe->dbbe_trace_thread(td);
+ }
+#ifdef STACK
+ else {
+ struct stack st;
+
+ printf("KDB: stack backtrace of thread %d:\n", td->td_tid);
+ stack_zero(&st);
+ stack_save_td(&st, td);
+ stack_print_ddb(&st);
+ }
+#endif
+}
+
+/*
+ * Set/change the current backend.
+ */
+
+int
+kdb_dbbe_select(const char *name)
+{
+ struct kdb_dbbe *be, **iter;
+
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ if (be->dbbe_active == 0 && strcmp(be->dbbe_name, name) == 0) {
+ kdb_dbbe = be;
+ return (0);
+ }
+ }
+ return (EINVAL);
+}
+
+/*
+ * Enter the currently selected debugger. If a message has been provided,
+ * it is printed first. If the debugger does not support the enter method,
+ * it is entered by using breakpoint(), which enters the debugger through
+ * kdb_trap(). The 'why' argument will contain a more mechanically usable
+ * string than 'msg', and is relied upon by DDB scripting to identify the
+ * reason for entering the debugger so that the right script can be run.
+ */
+void
+kdb_enter(const char *why, const char *msg)
+{
+
+ if (kdb_dbbe != NULL && kdb_active == 0) {
+ if (msg != NULL)
+ printf("KDB: enter: %s\n", msg);
+ kdb_why = why;
+ breakpoint();
+ kdb_why = KDB_WHY_UNSET;
+ }
+}
+
+/*
+ * Initialize the kernel debugger interface.
+ */
+
+void
+kdb_init(void)
+{
+ struct kdb_dbbe *be, **iter;
+ int cur_pri, pri;
+
+ kdb_active = 0;
+ kdb_dbbe = NULL;
+ cur_pri = -1;
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ pri = (be->dbbe_init != NULL) ? be->dbbe_init() : -1;
+ be->dbbe_active = (pri >= 0) ? 0 : -1;
+ if (pri > cur_pri) {
+ cur_pri = pri;
+ kdb_dbbe = be;
+ }
+ }
+ if (kdb_dbbe != NULL) {
+ printf("KDB: debugger backends:");
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ if (be->dbbe_active == 0)
+ printf(" %s", be->dbbe_name);
+ }
+ printf("\n");
+ printf("KDB: current backend: %s\n",
+ kdb_dbbe->dbbe_name);
+ }
+}
+
+/*
+ * Handle contexts.
+ */
+
+void *
+kdb_jmpbuf(jmp_buf new)
+{
+ void *old;
+
+ old = kdb_jmpbufp;
+ kdb_jmpbufp = new;
+ return (old);
+}
+
+void
+kdb_reenter(void)
+{
+
+ if (!kdb_active || kdb_jmpbufp == NULL)
+ return;
+
+ longjmp(kdb_jmpbufp, 1);
+ /* NOTREACHED */
+}
+
+/*
+ * Thread related support functions.
+ */
+
+struct pcb *
+kdb_thr_ctx(struct thread *thr)
+{
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ struct pcpu *pc;
+#endif
+
+ if (thr == curthread)
+ return (&kdb_pcb);
+
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ if (pc->pc_curthread == thr &&
+ CPU_ISSET(pc->pc_cpuid, &stopped_cpus))
+ return (KDB_STOPPEDPCB(pc));
+ }
+#endif
+ return (thr->td_pcb);
+}
+
+struct thread *
+kdb_thr_first(void)
+{
+ struct proc *p;
+ struct thread *thr;
+
+ p = LIST_FIRST(&allproc);
+ while (p != NULL) {
+ if (p->p_flag & P_INMEM) {
+ thr = FIRST_THREAD_IN_PROC(p);
+ if (thr != NULL)
+ return (thr);
+ }
+ p = LIST_NEXT(p, p_list);
+ }
+ return (NULL);
+}
+
+struct thread *
+kdb_thr_from_pid(pid_t pid)
+{
+ struct proc *p;
+
+ p = LIST_FIRST(&allproc);
+ while (p != NULL) {
+ if (p->p_flag & P_INMEM && p->p_pid == pid)
+ return (FIRST_THREAD_IN_PROC(p));
+ p = LIST_NEXT(p, p_list);
+ }
+ return (NULL);
+}
+
+struct thread *
+kdb_thr_lookup(lwpid_t tid)
+{
+ struct thread *thr;
+
+ thr = kdb_thr_first();
+ while (thr != NULL && thr->td_tid != tid)
+ thr = kdb_thr_next(thr);
+ return (thr);
+}
+
+struct thread *
+kdb_thr_next(struct thread *thr)
+{
+ struct proc *p;
+
+ p = thr->td_proc;
+ thr = TAILQ_NEXT(thr, td_plist);
+ do {
+ if (thr != NULL)
+ return (thr);
+ p = LIST_NEXT(p, p_list);
+ if (p != NULL && (p->p_flag & P_INMEM))
+ thr = FIRST_THREAD_IN_PROC(p);
+ } while (p != NULL);
+ return (NULL);
+}
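Together, kdb_thr_first() and kdb_thr_next() walk every thread of every in-core process; a minimal sketch of the iteration pattern a debugger backend might use:

    /* Sketch: visit all threads visible to the debugger. */
    struct thread *td;

    for (td = kdb_thr_first(); td != NULL; td = kdb_thr_next(td))
            printf("tid %d (pid %d)\n", td->td_tid, td->td_proc->p_pid);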
+
+int
+kdb_thr_select(struct thread *thr)
+{
+ if (thr == NULL)
+ return (EINVAL);
+ kdb_thread = thr;
+ kdb_thrctx = kdb_thr_ctx(thr);
+ return (0);
+}
+
+/*
+ * Enter the debugger due to a trap.
+ */
+
+int
+kdb_trap(int type, int code, struct trapframe *tf)
+{
+#ifdef SMP
+ cpuset_t other_cpus;
+#endif
+ struct kdb_dbbe *be;
+ register_t intr;
+ int handled;
+#ifdef SMP
+ int did_stop_cpus;
+#endif
+
+ be = kdb_dbbe;
+ if (be == NULL || be->dbbe_trap == NULL)
+ return (0);
+
+ /* We reenter the debugger through kdb_reenter(). */
+ if (kdb_active)
+ return (0);
+
+ intr = intr_disable();
+
+#ifdef SMP
+ if (!SCHEDULER_STOPPED()) {
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
+ did_stop_cpus = 1;
+ } else
+ did_stop_cpus = 0;
+#endif
+
+ kdb_active++;
+
+ kdb_frame = tf;
+
+ /* Let MD code do its thing first... */
+ kdb_cpu_trap(type, code);
+
+ makectx(tf, &kdb_pcb);
+ kdb_thr_select(curthread);
+
+ cngrab();
+
+ for (;;) {
+ handled = be->dbbe_trap(type, code);
+ if (be == kdb_dbbe)
+ break;
+ be = kdb_dbbe;
+ if (be == NULL || be->dbbe_trap == NULL)
+ break;
+ printf("Switching to %s back-end\n", be->dbbe_name);
+ }
+
+ cnungrab();
+
+ kdb_active--;
+
+#ifdef SMP
+ if (did_stop_cpus)
+ restart_cpus(stopped_cpus);
+#endif
+
+ intr_restore(intr);
+
+ return (handled);
+}
diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c
new file mode 100644
index 0000000..5be746a
--- /dev/null
+++ b/sys/kern/subr_kobj.c
@@ -0,0 +1,348 @@
+/*-
+ * Copyright (c) 2000,2003 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#ifndef TEST
+#include <sys/systm.h>
+#endif
+
+#ifdef TEST
+#include "usertest.h"
+#endif
+
+static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures");
+
+#ifdef KOBJ_STATS
+
+u_int kobj_lookup_hits;
+u_int kobj_lookup_misses;
+
+SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD,
+ &kobj_lookup_hits, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD,
+ &kobj_lookup_misses, 0, "");
+
+#endif
+
+static struct mtx kobj_mtx;
+static int kobj_mutex_inited;
+static int kobj_next_id = 1;
+
+#define KOBJ_LOCK() mtx_lock(&kobj_mtx)
+#define KOBJ_UNLOCK() mtx_unlock(&kobj_mtx)
+#define KOBJ_ASSERT(what) mtx_assert(&kobj_mtx, what);
+
+SYSCTL_INT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD,
+ &kobj_next_id, 0, "");
+
+static void
+kobj_init_mutex(void *arg)
+{
+ if (!kobj_mutex_inited) {
+ mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+ kobj_mutex_inited = 1;
+ }
+}
+
+SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL);
+
+/*
+ * This method structure is used to initialise new caches. Since the
+ * desc pointer is NULL, it is guaranteed never to match any read
+ * descriptors.
+ */
+static const struct kobj_method null_method = {
+ 0, 0,
+};
+
+int
+kobj_error_method(void)
+{
+
+ return ENXIO;
+}
+
+static void
+kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops)
+{
+ kobj_method_t *m;
+ int i;
+
+ /*
+ * Don't do anything if we are already compiled.
+ */
+ if (cls->ops)
+ return;
+
+ /*
+ * First register any methods which need it.
+ */
+ for (i = 0, m = cls->methods; m->desc; i++, m++) {
+ if (m->desc->id == 0)
+ m->desc->id = kobj_next_id++;
+ }
+
+ /*
+ * Then initialise the ops table.
+ */
+ for (i = 0; i < KOBJ_CACHE_SIZE; i++)
+ ops->cache[i] = &null_method;
+ ops->cls = cls;
+ cls->ops = ops;
+}
+
+void
+kobj_class_compile(kobj_class_t cls)
+{
+ kobj_ops_t ops;
+
+ KOBJ_ASSERT(MA_NOTOWNED);
+
+ /*
+ * Allocate space for the compiled ops table.
+ */
+ ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT);
+ if (!ops)
+ panic("%s: out of memory", __func__);
+
+ KOBJ_LOCK();
+
+ /*
+ * We may have lost a race for kobj_class_compile here - check
+ * to make sure someone else hasn't already compiled this
+ * class.
+ */
+ if (cls->ops) {
+ KOBJ_UNLOCK();
+ free(ops, M_KOBJ);
+ return;
+ }
+
+ kobj_class_compile_common(cls, ops);
+ KOBJ_UNLOCK();
+}
+
+void
+kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops)
+{
+
+ KASSERT(kobj_mutex_inited == 0,
+ ("%s: only supported during early cycles", __func__));
+
+ /*
+ * Increment refs to make sure that the ops table is not freed.
+ */
+ cls->refs++;
+ kobj_class_compile_common(cls, ops);
+}
+
+static kobj_method_t*
+kobj_lookup_method_class(kobj_class_t cls, kobjop_desc_t desc)
+{
+ kobj_method_t *methods = cls->methods;
+ kobj_method_t *ce;
+
+ for (ce = methods; ce && ce->desc; ce++) {
+ if (ce->desc == desc) {
+ return ce;
+ }
+ }
+
+ return NULL;
+}
+
+static kobj_method_t*
+kobj_lookup_method_mi(kobj_class_t cls,
+ kobjop_desc_t desc)
+{
+ kobj_method_t *ce;
+ kobj_class_t *basep;
+
+ ce = kobj_lookup_method_class(cls, desc);
+ if (ce)
+ return ce;
+
+ basep = cls->baseclasses;
+ if (basep) {
+ for (; *basep; basep++) {
+ ce = kobj_lookup_method_mi(*basep, desc);
+ if (ce)
+ return ce;
+ }
+ }
+
+ return NULL;
+}
+
+kobj_method_t*
+kobj_lookup_method(kobj_class_t cls,
+ kobj_method_t **cep,
+ kobjop_desc_t desc)
+{
+ kobj_method_t *ce;
+
+#ifdef KOBJ_STATS
+ /*
+ * Correct for the 'hit' assumption in KOBJOPLOOKUP and record
+ * a 'miss'.
+ */
+ kobj_lookup_hits--;
+ kobj_lookup_misses++;
+#endif
+
+ ce = kobj_lookup_method_mi(cls, desc);
+ if (!ce)
+ ce = &desc->deflt;
+ *cep = ce;
+ return ce;
+}
+
+void
+kobj_class_free(kobj_class_t cls)
+{
+ void* ops = NULL;
+
+ KOBJ_ASSERT(MA_NOTOWNED);
+ KOBJ_LOCK();
+
+ /*
+ * Protect against a race between kobj_create and
+ * kobj_delete.
+ */
+ if (cls->refs == 0) {
+ /*
+ * For now we don't do anything to unregister any methods
+ * which are no longer used.
+ */
+
+ /*
+ * Free memory and clean up.
+ */
+ ops = cls->ops;
+ cls->ops = NULL;
+ }
+
+ KOBJ_UNLOCK();
+
+ if (ops)
+ free(ops, M_KOBJ);
+}
+
+kobj_t
+kobj_create(kobj_class_t cls,
+ struct malloc_type *mtype,
+ int mflags)
+{
+ kobj_t obj;
+
+ /*
+ * Allocate and initialise the new object.
+ */
+ obj = malloc(cls->size, mtype, mflags | M_ZERO);
+ if (!obj)
+ return NULL;
+ kobj_init(obj, cls);
+
+ return obj;
+}
+
+static void
+kobj_init_common(kobj_t obj, kobj_class_t cls)
+{
+
+ obj->ops = cls->ops;
+ cls->refs++;
+}
+
+void
+kobj_init(kobj_t obj, kobj_class_t cls)
+{
+ KOBJ_ASSERT(MA_NOTOWNED);
+ retry:
+ KOBJ_LOCK();
+
+ /*
+ * Consider compiling the class' method table.
+ */
+ if (!cls->ops) {
+ /*
+ * kobj_class_compile doesn't want the lock held
+ * because of the call to malloc - we drop the lock
+ * and re-try.
+ */
+ KOBJ_UNLOCK();
+ kobj_class_compile(cls);
+ goto retry;
+ }
+
+ kobj_init_common(obj, cls);
+
+ KOBJ_UNLOCK();
+}
+
+void
+kobj_init_static(kobj_t obj, kobj_class_t cls)
+{
+
+ KASSERT(kobj_mutex_inited == 0,
+ ("%s: only supported during early cycles", __func__));
+
+ kobj_init_common(obj, cls);
+}
+
+void
+kobj_delete(kobj_t obj, struct malloc_type *mtype)
+{
+ kobj_class_t cls = obj->ops->cls;
+ int refs;
+
+ /*
+ * Consider freeing the compiled method table for the class
+ * after its last instance is deleted. As an optimisation, we
+ * should defer this for a short while to avoid thrashing.
+ */
+ KOBJ_ASSERT(MA_NOTOWNED);
+ KOBJ_LOCK();
+ cls->refs--;
+ refs = cls->refs;
+ KOBJ_UNLOCK();
+
+ if (!refs)
+ kobj_class_free(cls);
+
+ obj->ops = NULL;
+ if (mtype)
+ free(obj, mtype);
+}
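To tie the pieces together, a hedged sketch of an instance lifecycle for a hypothetical class foo_class (declared elsewhere, e.g. with the DEFINE_CLASS() macro from sys/kobj.h): the first kobj_create() or kobj_init() compiles the class, every instance holds a reference, and deleting the last instance lets kobj_class_free() release the compiled ops table.

    /* Sketch: create, use and destroy a kobj instance of foo_class. */
    kobj_t obj;

    obj = kobj_create(&foo_class, M_TEMP, M_WAITOK); /* compiles class on first use */
    /* ... invoke methods on obj through its generated dispatch stubs ... */
    kobj_delete(obj, M_TEMP);                        /* last delete frees the ops table */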
diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
new file mode 100644
index 0000000..94908ac
--- /dev/null
+++ b/sys/kern/subr_lock.c
@@ -0,0 +1,649 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and functions used to maintain
+ * lock_object structures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_mprof.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/lock_profile.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <machine/cpufunc.h>
+
+CTASSERT(LOCK_CLASS_MAX == 15);
+
+struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = {
+ &lock_class_mtx_spin,
+ &lock_class_mtx_sleep,
+ &lock_class_sx,
+ &lock_class_rm,
+ &lock_class_rm_sleepable,
+ &lock_class_rw,
+ &lock_class_lockmgr,
+};
+
+void
+lock_init(struct lock_object *lock, struct lock_class *class, const char *name,
+ const char *type, int flags)
+{
+ int i;
+
+ /* Check for double-init and zero object. */
+ KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
+ name, lock));
+
+ /* Look up lock class to find its index. */
+ for (i = 0; i < LOCK_CLASS_MAX; i++)
+ if (lock_classes[i] == class) {
+ lock->lo_flags = i << LO_CLASSSHIFT;
+ break;
+ }
+ KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class));
+
+ /* Initialize the lock object. */
+ lock->lo_name = name;
+ lock->lo_flags |= flags | LO_INITIALIZED;
+ LOCK_LOG_INIT(lock, 0);
+ WITNESS_INIT(lock, (type != NULL) ? type : name);
+}
+
+void
+lock_destroy(struct lock_object *lock)
+{
+
+ KASSERT(lock_initalized(lock), ("lock %p is not initialized", lock));
+ WITNESS_DESTROY(lock);
+ LOCK_LOG_DESTROY(lock, 0);
+ lock->lo_flags &= ~LO_INITIALIZED;
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(lock, db_show_lock)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) {
+ db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock));
+ return;
+ }
+ class = LOCK_CLASS(lock);
+ db_printf(" class: %s\n", class->lc_name);
+ db_printf(" name: %s\n", lock->lo_name);
+ class->lc_ddb_show(lock);
+}
+#endif
+
+#ifdef LOCK_PROFILING
+
+/*
+ * One object per-thread for each lock the thread owns. Tracks individual
+ * lock instances.
+ */
+struct lock_profile_object {
+ LIST_ENTRY(lock_profile_object) lpo_link;
+ struct lock_object *lpo_obj;
+ const char *lpo_file;
+ int lpo_line;
+ uint16_t lpo_ref;
+ uint16_t lpo_cnt;
+ uint64_t lpo_acqtime;
+ uint64_t lpo_waittime;
+ u_int lpo_contest_locking;
+};
+
+/*
+ * One lock_prof for each (file, line, lock object) triple.
+ */
+struct lock_prof {
+ SLIST_ENTRY(lock_prof) link;
+ struct lock_class *class;
+ const char *file;
+ const char *name;
+ int line;
+ int ticks;
+ uintmax_t cnt_wait_max;
+ uintmax_t cnt_max;
+ uintmax_t cnt_tot;
+ uintmax_t cnt_wait;
+ uintmax_t cnt_cur;
+ uintmax_t cnt_contest_locking;
+};
+
+SLIST_HEAD(lphead, lock_prof);
+
+#define LPROF_HASH_SIZE 4096
+#define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1)
+#define LPROF_CACHE_SIZE 4096
+
+/*
+ * Array of objects and profs for each type of object for each cpu. Spinlocks
+ * are handled separately because a thread may be preempted and acquire a
+ * spinlock while in the lock profiling code of a non-spinlock. In this way
+ * we only need a critical section to protect the per-cpu lists.
+ */
+struct lock_prof_type {
+ struct lphead lpt_lpalloc;
+ struct lpohead lpt_lpoalloc;
+ struct lphead lpt_hash[LPROF_HASH_SIZE];
+ struct lock_prof lpt_prof[LPROF_CACHE_SIZE];
+ struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE];
+};
+
+struct lock_prof_cpu {
+ struct lock_prof_type lpc_types[2]; /* One for spin one for other. */
+};
+
+struct lock_prof_cpu *lp_cpu[MAXCPU];
+
+volatile int lock_prof_enable = 0;
+static volatile int lock_prof_resetting;
+
+#define LPROF_SBUF_SIZE 256
+
+static int lock_prof_rejected;
+static int lock_prof_skipspin;
+static int lock_prof_skipcount;
+
+#ifndef USE_CPU_NANOSECONDS
+uint64_t
+nanoseconds(void)
+{
+ struct bintime bt;
+ uint64_t ns;
+
+ binuptime(&bt);
+ /* From bintime2timespec */
+ ns = bt.sec * (uint64_t)1000000000;
+ ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (ns);
+}
+#endif
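A short note on the fixed-point arithmetic above: bt.frac is a 64-bit binary fraction of one second, so the exact conversion is

    ns = bt.sec * 10^9 + bt.frac * 10^9 / 2^64

To keep the intermediate product within 64 bits, the code uses only the upper 32 bits of the fraction:

    ns ~= bt.sec * 10^9 + ((bt.frac >> 32) * 10^9) >> 32

which discards at most 2^32 / 2^64 s, roughly 0.23 ns, far below the microsecond granularity the profiler reports.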
+
+static void
+lock_prof_init_type(struct lock_prof_type *type)
+{
+ int i;
+
+ SLIST_INIT(&type->lpt_lpalloc);
+ LIST_INIT(&type->lpt_lpoalloc);
+ for (i = 0; i < LPROF_CACHE_SIZE; i++) {
+ SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i],
+ link);
+ LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i],
+ lpo_link);
+ }
+}
+
+static void
+lock_prof_init(void *arg)
+{
+ int cpu;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF,
+ M_WAITOK | M_ZERO);
+ lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]);
+ lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]);
+ }
+}
+SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL);
+
+static void
+lock_prof_reset_wait(void)
+{
+
+ /*
+ * Spin relinquishing our cpu so that quiesce_all_cpus may
+ * complete.
+ */
+ while (lock_prof_resetting)
+ sched_relinquish(curthread);
+}
+
+static void
+lock_prof_reset(void)
+{
+ struct lock_prof_cpu *lpc;
+ int enabled, i, cpu;
+
+ /*
+ * We race not only with lock acquisition and release but also with
+ * thread exit. To be certain that threads exit without valid head
+ * pointers, they must see resetting set before enabled is cleared;
+ * otherwise a lock could be left on a per-thread list because disabled
+ * was observed, without the thread waiting for reset() to remove it
+ * below.
+ */
+ atomic_store_rel_int(&lock_prof_resetting, 1);
+ enabled = lock_prof_enable;
+ lock_prof_enable = 0;
+ quiesce_all_cpus("profreset", 0);
+ /*
+ * Some objects may have migrated between CPUs. Clear all links
+ * before we zero the structures. Some items may still be linked
+ * into per-thread lists as well.
+ */
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lpc = lp_cpu[cpu];
+ for (i = 0; i < LPROF_CACHE_SIZE; i++) {
+ LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link);
+ LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link);
+ }
+ }
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lpc = lp_cpu[cpu];
+ bzero(lpc, sizeof(*lpc));
+ lock_prof_init_type(&lpc->lpc_types[0]);
+ lock_prof_init_type(&lpc->lpc_types[1]);
+ }
+ atomic_store_rel_int(&lock_prof_resetting, 0);
+ lock_prof_enable = enabled;
+}
+
+static void
+lock_prof_output(struct lock_prof *lp, struct sbuf *sb)
+{
+ const char *p;
+
+ for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3);
+ sbuf_printf(sb,
+ "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n",
+ lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000,
+ lp->cnt_wait / 1000, lp->cnt_cur,
+ lp->cnt_cur == 0 ? (uintmax_t)0 :
+ lp->cnt_tot / (lp->cnt_cur * 1000),
+ lp->cnt_cur == 0 ? (uintmax_t)0 :
+ lp->cnt_wait / (lp->cnt_cur * 1000),
+ (uintmax_t)0, lp->cnt_contest_locking,
+ p, lp->line, lp->class->lc_name, lp->name);
+}
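For reference when reading the sysctl output produced below: the time columns (max, wait_max, total, wait_total, avg, wait_avg) are printed in microseconds, i.e. the nanosecond counters divided by 1000, while count is the raw acquisition count. The two averages are derived per acquisition on the fly:

    avg      = cnt_tot  / (cnt_cur * 1000)
    wait_avg = cnt_wait / (cnt_cur * 1000)

The cnt_hold column is always emitted as 0 in this version, and cnt_lock reports cnt_contest_locking.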
+
+static void
+lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash,
+ int spin, int t)
+{
+ struct lock_prof_type *type;
+ struct lock_prof *l;
+ int cpu;
+
+ dst->file = match->file;
+ dst->line = match->line;
+ dst->class = match->class;
+ dst->name = match->name;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (lp_cpu[cpu] == NULL)
+ continue;
+ type = &lp_cpu[cpu]->lpc_types[spin];
+ SLIST_FOREACH(l, &type->lpt_hash[hash], link) {
+ if (l->ticks == t)
+ continue;
+ if (l->file != match->file || l->line != match->line ||
+ l->name != match->name)
+ continue;
+ l->ticks = t;
+ if (l->cnt_max > dst->cnt_max)
+ dst->cnt_max = l->cnt_max;
+ if (l->cnt_wait_max > dst->cnt_wait_max)
+ dst->cnt_wait_max = l->cnt_wait_max;
+ dst->cnt_tot += l->cnt_tot;
+ dst->cnt_wait += l->cnt_wait;
+ dst->cnt_cur += l->cnt_cur;
+ dst->cnt_contest_locking += l->cnt_contest_locking;
+ }
+ }
+
+}
+
+static void
+lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin,
+ int t)
+{
+ struct lock_prof *l;
+ int i;
+
+ for (i = 0; i < LPROF_HASH_SIZE; ++i) {
+ SLIST_FOREACH(l, &type->lpt_hash[i], link) {
+ struct lock_prof lp = {};
+
+ if (l->ticks == t)
+ continue;
+ lock_prof_sum(l, &lp, i, spin, t);
+ lock_prof_output(&lp, sb);
+ }
+ }
+}
+
+static int
+dump_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ int error, cpu, t;
+ int enabled;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req);
+ sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n",
+ "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
+ enabled = lock_prof_enable;
+ lock_prof_enable = 0;
+ quiesce_all_cpus("profstat", 0);
+ t = ticks;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (lp_cpu[cpu] == NULL)
+ continue;
+ lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t);
+ lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t);
+ }
+ lock_prof_enable = enabled;
+
+ error = sbuf_finish(sb);
+ /* Output a trailing NUL. */
+ if (error == 0)
+ error = SYSCTL_OUT(req, "", 1);
+ sbuf_delete(sb);
+ return (error);
+}
+
+static int
+enable_lock_prof(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = lock_prof_enable;
+ error = sysctl_handle_int(oidp, &v, v, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == lock_prof_enable)
+ return (0);
+ if (v == 1)
+ lock_prof_reset();
+ lock_prof_enable = !!v;
+
+ return (0);
+}
+
+static int
+reset_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+ lock_prof_reset();
+
+ return (0);
+}
+
+static struct lock_prof *
+lock_profile_lookup(struct lock_object *lo, int spin, const char *file,
+ int line)
+{
+ const char *unknown = "(unknown)";
+ struct lock_prof_type *type;
+ struct lock_prof *lp;
+ struct lphead *head;
+ const char *p;
+ u_int hash;
+
+ p = file;
+ if (p == NULL || *p == '\0')
+ p = unknown;
+ hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line;
+ hash &= LPROF_HASH_MASK;
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ head = &type->lpt_hash[hash];
+ SLIST_FOREACH(lp, head, link) {
+ if (lp->line == line && lp->file == p &&
+ lp->name == lo->lo_name)
+ return (lp);
+
+ }
+ lp = SLIST_FIRST(&type->lpt_lpalloc);
+ if (lp == NULL) {
+ lock_prof_rejected++;
+ return (lp);
+ }
+ SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link);
+ lp->file = p;
+ lp->line = line;
+ lp->class = LOCK_CLASS(lo);
+ lp->name = lo->lo_name;
+ SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link);
+ return (lp);
+}
+
+static struct lock_profile_object *
+lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file,
+ int line)
+{
+ struct lock_profile_object *l;
+ struct lock_prof_type *type;
+ struct lpohead *head;
+
+ head = &curthread->td_lprof[spin];
+ LIST_FOREACH(l, head, lpo_link)
+ if (l->lpo_obj == lo && l->lpo_file == file &&
+ l->lpo_line == line)
+ return (l);
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ l = LIST_FIRST(&type->lpt_lpoalloc);
+ if (l == NULL) {
+ lock_prof_rejected++;
+ return (NULL);
+ }
+ LIST_REMOVE(l, lpo_link);
+ l->lpo_obj = lo;
+ l->lpo_file = file;
+ l->lpo_line = line;
+ l->lpo_cnt = 0;
+ LIST_INSERT_HEAD(head, l, lpo_link);
+
+ return (l);
+}
+
+void
+lock_profile_obtain_lock_success(struct lock_object *lo, int contested,
+ uint64_t waittime, const char *file, int line)
+{
+ static int lock_prof_count;
+ struct lock_profile_object *l;
+ int spin;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ /* don't reset the timer when/if recursing */
+ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE))
+ return;
+ if (lock_prof_skipcount &&
+ (++lock_prof_count % lock_prof_skipcount) != 0)
+ return;
+ spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
+ if (spin && lock_prof_skipspin == 1)
+ return;
+ critical_enter();
+ /* Recheck enabled now that we're in a critical section. */
+ if (lock_prof_enable == 0)
+ goto out;
+ l = lock_profile_object_lookup(lo, spin, file, line);
+ if (l == NULL)
+ goto out;
+ l->lpo_cnt++;
+ if (++l->lpo_ref > 1)
+ goto out;
+ l->lpo_contest_locking = contested;
+ l->lpo_acqtime = nanoseconds();
+ if (waittime && (l->lpo_acqtime > waittime))
+ l->lpo_waittime = l->lpo_acqtime - waittime;
+ else
+ l->lpo_waittime = 0;
+out:
+ critical_exit();
+}
+
+void
+lock_profile_thread_exit(struct thread *td)
+{
+#ifdef INVARIANTS
+ struct lock_profile_object *l;
+
+ MPASS(curthread->td_critnest == 0);
+#endif
+ /*
+ * If lock profiling was disabled we have to wait for reset to
+ * clear our pointers before we can exit safely.
+ */
+ lock_prof_reset_wait();
+#ifdef INVARIANTS
+ LIST_FOREACH(l, &td->td_lprof[0], lpo_link)
+ printf("thread still holds lock acquired at %s:%d\n",
+ l->lpo_file, l->lpo_line);
+ LIST_FOREACH(l, &td->td_lprof[1], lpo_link)
+ printf("thread still holds lock acquired at %s:%d\n",
+ l->lpo_file, l->lpo_line);
+#endif
+ MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL);
+ MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL);
+}
+
+void
+lock_profile_release_lock(struct lock_object *lo)
+{
+ struct lock_profile_object *l;
+ struct lock_prof_type *type;
+ struct lock_prof *lp;
+ uint64_t curtime, holdtime;
+ struct lpohead *head;
+ int spin;
+
+ if (SCHEDULER_STOPPED())
+ return;
+ if (lo->lo_flags & LO_NOPROFILE)
+ return;
+ spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
+ head = &curthread->td_lprof[spin];
+ if (LIST_FIRST(head) == NULL)
+ return;
+ critical_enter();
+ /* Recheck enabled now that we're in a critical section. */
+ if (lock_prof_enable == 0 && lock_prof_resetting == 1)
+ goto out;
+ /*
+ * If lock profiling is not enabled we still want to remove the
+ * lpo from our queue.
+ */
+ LIST_FOREACH(l, head, lpo_link)
+ if (l->lpo_obj == lo)
+ break;
+ if (l == NULL)
+ goto out;
+ if (--l->lpo_ref > 0)
+ goto out;
+ lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line);
+ if (lp == NULL)
+ goto release;
+ curtime = nanoseconds();
+ if (curtime < l->lpo_acqtime)
+ goto release;
+ holdtime = curtime - l->lpo_acqtime;
+
+ /*
+ * Record if the lock has been held longer now than ever
+ * before.
+ */
+ if (holdtime > lp->cnt_max)
+ lp->cnt_max = holdtime;
+ if (l->lpo_waittime > lp->cnt_wait_max)
+ lp->cnt_wait_max = l->lpo_waittime;
+ lp->cnt_tot += holdtime;
+ lp->cnt_wait += l->lpo_waittime;
+ lp->cnt_contest_locking += l->lpo_contest_locking;
+ lp->cnt_cur += l->lpo_cnt;
+release:
+ LIST_REMOVE(l, lpo_link);
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link);
+out:
+ critical_exit();
+}
+
+static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging");
+static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL,
+ "lock profiling");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW,
+ &lock_prof_skipspin, 0, "Skip profiling on spinlocks.");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW,
+ &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions.");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD,
+ &lock_prof_rejected, 0, "Number of rejected profiling records");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, enable_lock_prof, "I", "Enable lock profiling");
+
+#endif
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
new file mode 100644
index 0000000..1e61274
--- /dev/null
+++ b/sys/kern/subr_log.c
@@ -0,0 +1,310 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_log.c 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Error log buffer for kernel printf's.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/msgbuf.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/poll.h>
+#include <sys/filedesc.h>
+#include <sys/sysctl.h>
+
+#define LOG_RDPRI (PZERO + 1)
+
+#define LOG_ASYNC 0x04
+
+static d_open_t logopen;
+static d_close_t logclose;
+static d_read_t logread;
+static d_ioctl_t logioctl;
+static d_poll_t logpoll;
+static d_kqfilter_t logkqfilter;
+
+static void logtimeout(void *arg);
+
+static struct cdevsw log_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = logopen,
+ .d_close = logclose,
+ .d_read = logread,
+ .d_ioctl = logioctl,
+ .d_poll = logpoll,
+ .d_kqfilter = logkqfilter,
+ .d_name = "log",
+};
+
+static int logkqread(struct knote *note, long hint);
+static void logkqdetach(struct knote *note);
+
+static struct filterops log_read_filterops = {
+ .f_isfd = 1,
+ .f_attach = NULL,
+ .f_detach = logkqdetach,
+ .f_event = logkqread,
+};
+
+static struct logsoftc {
+ int sc_state; /* see above for possibilities */
+ struct selinfo sc_selp; /* process waiting on select call */
+ struct sigio *sc_sigio; /* information for async I/O */
+ struct callout sc_callout; /* callout to wakeup syslog */
+} logsoftc;
+
+int log_open; /* also used in log() */
+static struct cv log_wakeup;
+struct mtx msgbuf_lock;
+MTX_SYSINIT(msgbuf_lock, &msgbuf_lock, "msgbuf lock", MTX_DEF);
+
+/* Times per second to check for a pending syslog wakeup. */
+static int log_wakeups_per_second = 5;
+SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW,
+ &log_wakeups_per_second, 0, "");
+
+/*ARGSUSED*/
+static int
+logopen(struct cdev *dev, int flags, int mode, struct thread *td)
+{
+
+ if (log_wakeups_per_second < 1) {
+ printf("syslog wakeup is less than one. Adjusting to 1.\n");
+ log_wakeups_per_second = 1;
+ }
+
+ mtx_lock(&msgbuf_lock);
+ if (log_open) {
+ mtx_unlock(&msgbuf_lock);
+ return (EBUSY);
+ }
+ log_open = 1;
+ callout_reset_sbt(&logsoftc.sc_callout,
+ SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
+ mtx_unlock(&msgbuf_lock);
+
+ fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logclose(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+
+ funsetown(&logsoftc.sc_sigio);
+
+ mtx_lock(&msgbuf_lock);
+ callout_stop(&logsoftc.sc_callout);
+ logsoftc.sc_state = 0;
+ log_open = 0;
+ mtx_unlock(&msgbuf_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logread(struct cdev *dev, struct uio *uio, int flag)
+{
+ char buf[128];
+ struct msgbuf *mbp = msgbufp;
+ int error = 0, l;
+
+ mtx_lock(&msgbuf_lock);
+ while (msgbuf_getcount(mbp) == 0) {
+ if (flag & IO_NDELAY) {
+ mtx_unlock(&msgbuf_lock);
+ return (EWOULDBLOCK);
+ }
+ if ((error = cv_wait_sig(&log_wakeup, &msgbuf_lock)) != 0) {
+ mtx_unlock(&msgbuf_lock);
+ return (error);
+ }
+ }
+
+ while (uio->uio_resid > 0) {
+ l = imin(sizeof(buf), uio->uio_resid);
+ l = msgbuf_getbytes(mbp, buf, l);
+ if (l == 0)
+ break;
+ mtx_unlock(&msgbuf_lock);
+ error = uiomove(buf, l, uio);
+ if (error || uio->uio_resid == 0)
+ return (error);
+ mtx_lock(&msgbuf_lock);
+ }
+ mtx_unlock(&msgbuf_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+logpoll(struct cdev *dev, int events, struct thread *td)
+{
+ int revents = 0;
+
+ if (events & (POLLIN | POLLRDNORM)) {
+ mtx_lock(&msgbuf_lock);
+ if (msgbuf_getcount(msgbufp) > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &logsoftc.sc_selp);
+ mtx_unlock(&msgbuf_lock);
+ }
+ return (revents);
+}
+
+static int
+logkqfilter(struct cdev *dev, struct knote *kn)
+{
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+
+ kn->kn_fop = &log_read_filterops;
+ kn->kn_hook = NULL;
+
+ mtx_lock(&msgbuf_lock);
+ knlist_add(&logsoftc.sc_selp.si_note, kn, 1);
+ mtx_unlock(&msgbuf_lock);
+ return (0);
+}
+
+static int
+logkqread(struct knote *kn, long hint)
+{
+
+ mtx_assert(&msgbuf_lock, MA_OWNED);
+ kn->kn_data = msgbuf_getcount(msgbufp);
+ return (kn->kn_data != 0);
+}
+
+static void
+logkqdetach(struct knote *kn)
+{
+
+ mtx_lock(&msgbuf_lock);
+ knlist_remove(&logsoftc.sc_selp.si_note, kn, 1);
+ mtx_unlock(&msgbuf_lock);
+}
+
+static void
+logtimeout(void *arg)
+{
+
+ if (!log_open)
+ return;
+ if (msgbuftrigger == 0)
+ goto done;
+ msgbuftrigger = 0;
+ selwakeuppri(&logsoftc.sc_selp, LOG_RDPRI);
+ KNOTE_LOCKED(&logsoftc.sc_selp.si_note, 0);
+ if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL)
+ pgsigio(&logsoftc.sc_sigio, SIGIO, 0);
+ cv_broadcastpri(&log_wakeup, LOG_RDPRI);
+done:
+ if (log_wakeups_per_second < 1) {
+ printf("syslog wakeup is less than one. Adjusting to 1.\n");
+ log_wakeups_per_second = 1;
+ }
+ callout_reset_sbt(&logsoftc.sc_callout,
+ SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
+}
+
+/*ARGSUSED*/
+static int
+logioctl(struct cdev *dev, u_long com, caddr_t data, int flag, struct thread *td)
+{
+
+ switch (com) {
+
+ /* return number of characters immediately available */
+ case FIONREAD:
+ *(int *)data = msgbuf_getcount(msgbufp);
+ break;
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ mtx_lock(&msgbuf_lock);
+ if (*(int *)data)
+ logsoftc.sc_state |= LOG_ASYNC;
+ else
+ logsoftc.sc_state &= ~LOG_ASYNC;
+ mtx_unlock(&msgbuf_lock);
+ break;
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &logsoftc.sc_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&logsoftc.sc_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &logsoftc.sc_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(&logsoftc.sc_sigio);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+static void
+log_drvinit(void *unused)
+{
+
+ cv_init(&log_wakeup, "klog");
+ callout_init_mtx(&logsoftc.sc_callout, &msgbuf_lock, 0);
+ knlist_init_mtx(&logsoftc.sc_selp.si_note, &msgbuf_lock);
+ make_dev_credf(MAKEDEV_ETERNAL, &log_cdevsw, 0, NULL, UID_ROOT,
+ GID_WHEEL, 0600, "klog");
+}
+
+SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,log_drvinit,NULL);
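For context, a hedged userland sketch of the consumer side: the driver above creates /dev/klog (root-only, mode 0600), and a single reader, normally syslogd, drains it. The program below is a minimal stand-in for illustration, not how syslogd is actually implemented:

    /* Sketch: drain kernel log messages from /dev/klog (run as root). */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            char buf[1024];
            ssize_t n;
            int fd;

            fd = open("/dev/klog", O_RDONLY); /* EBUSY if another reader holds it */
            if (fd == -1)
                    return (1);
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, (size_t)n, stdout);
            close(fd);
            return (0);
    }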
diff --git a/sys/kern/subr_mbpool.c b/sys/kern/subr_mbpool.c
new file mode 100644
index 0000000..0b8cda6
--- /dev/null
+++ b/sys/kern/subr_mbpool.c
@@ -0,0 +1,402 @@
+/*-
+ * Copyright (c) 2003
+ * Fraunhofer Institute for Open Communication Systems (FhG Fokus).
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Hartmut Brandt <harti@freebsd.org>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <machine/bus.h>
+
+#include <sys/mbuf.h>
+#include <sys/mbpool.h>
+
+MODULE_VERSION(libmbpool, 1);
+
+/*
+ * Memory is allocated as DMA-able pages. Each page is divided into a number
+ * of equal chunks where the last 4 bytes of each chunk are occupied by
+ * the page number and the chunk number. The caller must take these four
+ * bytes into account when specifying the chunk size. Each page is mapped by
+ * its own DMA map using the user specified DMA tag.
+ *
+ * Each chunk has a used and a card bit in the high bits of its page number:
+ *	card used
+ *	  0    0	chunk is free and may be allocated
+ *	  1    1	chunk has been given to the interface
+ *	  0    1	chunk is traveling through the system
+ *	  1    0	illegal
+ */
+struct mbtrail {
+ uint16_t chunk;
+ uint16_t page;
+};
+#define MBP_CARD 0x8000
+#define MBP_USED 0x4000
+#define MBP_PMSK 0x3fff /* page number mask */
+#define MBP_CMSK 0x01ff /* chunk number mask */
+
+struct mbfree {
+ SLIST_ENTRY(mbfree) link; /* link on free list */
+};
+
+struct mbpage {
+ bus_dmamap_t map; /* map for this page */
+ bus_addr_t phy; /* physical address */
+ void *va; /* the memory */
+};
+
+struct mbpool {
+ const char *name; /* a name for this pool */
+ bus_dma_tag_t dmat; /* tag for mapping */
+ u_int max_pages; /* maximum number of pages */
+ size_t page_size; /* size of each allocation */
+ size_t chunk_size; /* size of each external mbuf */
+
+ struct mtx free_lock; /* lock of free list */
+ SLIST_HEAD(, mbfree) free_list; /* free list */
+ u_int npages; /* current number of pages */
+ u_int nchunks; /* chunks per page */
+ struct mbpage pages[]; /* pages */
+};
+
+static MALLOC_DEFINE(M_MBPOOL, "mbpools", "mbuf pools");
+
+/*
+ * Make a trail pointer from a chunk pointer
+ */
+#define C2T(P, C) ((struct mbtrail *)((char *)(C) + (P)->chunk_size - \
+ sizeof(struct mbtrail)))
+
+/*
+ * Make a free chunk pointer from a chunk number
+ */
+#define N2C(P, PG, C) ((struct mbfree *)((char *)(PG)->va + \
+ (C) * (P)->chunk_size))
+
+/*
+ * Make/parse handles
+ */
+#define HMAKE(P, C) ((((P) & MBP_PMSK) << 16) | ((C) << 7))
+#define HPAGE(H) (((H) >> 16) & MBP_PMSK)
+#define HCHUNK(H) (((H) >> 7) & MBP_CMSK)
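A worked example of the handle layout defined by the macros above (pure arithmetic, derivable from the definitions): HMAKE() places the 14-bit page number in bits 16-29 and the chunk number in bits 7-15, so for page 3, chunk 5:

    HMAKE(3, 5)     = (3 << 16) | (5 << 7)     = 0x30000 | 0x280 = 0x30280
    HPAGE(0x30280)  = (0x30280 >> 16) & 0x3fff = 3
    HCHUNK(0x30280) = (0x30280 >> 7)  & 0x01ff = 5

Bits 0-6 of a handle remain zero, and the MBP_CARD/MBP_USED flags never appear in one because HMAKE() masks the page number with MBP_PMSK.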
+
+/*
+ * initialize a pool
+ */
+int
+mbp_create(struct mbpool **pp, const char *name, bus_dma_tag_t dmat,
+ u_int max_pages, size_t page_size, size_t chunk_size)
+{
+ u_int nchunks;
+
+ if (max_pages > MBPOOL_MAX_MAXPAGES || chunk_size == 0)
+ return (EINVAL);
+ nchunks = page_size / chunk_size;
+ if (nchunks == 0 || nchunks > MBPOOL_MAX_CHUNKS)
+ return (EINVAL);
+
+ (*pp) = malloc(sizeof(struct mbpool) +
+ max_pages * sizeof(struct mbpage),
+ M_MBPOOL, M_WAITOK | M_ZERO);
+
+ (*pp)->name = name;
+ (*pp)->dmat = dmat;
+ (*pp)->max_pages = max_pages;
+ (*pp)->page_size = page_size;
+ (*pp)->chunk_size = chunk_size;
+ (*pp)->nchunks = nchunks;
+
+ SLIST_INIT(&(*pp)->free_list);
+ mtx_init(&(*pp)->free_lock, name, NULL, MTX_DEF);
+
+ return (0);
+}
+
+/*
+ * destroy a pool
+ */
+void
+mbp_destroy(struct mbpool *p)
+{
+ u_int i;
+ struct mbpage *pg;
+#ifdef DIAGNOSTIC
+ struct mbtrail *tr;
+ u_int b;
+#endif
+
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+#ifdef DIAGNOSTIC
+ for (b = 0; b < p->nchunks; b++) {
+ tr = C2T(p, N2C(p, pg, b));
+ if (tr->page & MBP_CARD)
+ printf("%s: (%s) buf still on card"
+ " %u/%u\n", __func__, p->name, i, b);
+ if (tr->page & MBP_USED)
+ printf("%s: (%s) sbuf still in use"
+ " %u/%u\n", __func__, p->name, i, b);
+ }
+#endif
+ bus_dmamap_unload(p->dmat, pg->map);
+ bus_dmamem_free(p->dmat, pg->va, pg->map);
+ }
+ mtx_destroy(&p->free_lock);
+
+ free(p, M_MBPOOL);
+}
+
+/*
+ * Helper function when loading a one segment DMA buffer.
+ */
+static void
+mbp_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+ if (error == 0)
+ *(bus_addr_t *)arg = segs[0].ds_addr;
+}
+
+/*
+ * Allocate a new page
+ */
+static void
+mbp_alloc_page(struct mbpool *p)
+{
+ int error;
+ struct mbpage *pg;
+ u_int i;
+ struct mbfree *f;
+ struct mbtrail *t;
+
+ if (p->npages == p->max_pages) {
+#ifdef DIAGNOSTIC
+ printf("%s: (%s) page limit reached %u\n", __func__,
+ p->name, p->max_pages);
+#endif
+ return;
+ }
+ pg = &p->pages[p->npages];
+
+ error = bus_dmamem_alloc(p->dmat, &pg->va, BUS_DMA_NOWAIT, &pg->map);
+ if (error != 0) {
+ /* pg is embedded in the pool allocation itself; nothing to free. */
+ return;
+ }
+
+ error = bus_dmamap_load(p->dmat, pg->map, pg->va, p->page_size,
+ mbp_callback, &pg->phy, 0);
+ if (error != 0) {
+ bus_dmamem_free(p->dmat, pg->va, pg->map);
+ return;
+ }
+
+ for (i = 0; i < p->nchunks; i++) {
+ f = N2C(p, pg, i);
+ t = C2T(p, f);
+ t->page = p->npages;
+ t->chunk = i;
+ SLIST_INSERT_HEAD(&p->free_list, f, link);
+ }
+
+ p->npages++;
+}
+
+/*
+ * allocate a chunk
+ */
+void *
+mbp_alloc(struct mbpool *p, bus_addr_t *pap, uint32_t *hp)
+{
+ struct mbfree *cf;
+ struct mbtrail *t;
+
+ mtx_lock(&p->free_lock);
+ if ((cf = SLIST_FIRST(&p->free_list)) == NULL) {
+ mbp_alloc_page(p);
+ cf = SLIST_FIRST(&p->free_list);
+ }
+ if (cf == NULL) {
+ mtx_unlock(&p->free_lock);
+ return (NULL);
+ }
+ SLIST_REMOVE_HEAD(&p->free_list, link);
+ mtx_unlock(&p->free_lock);
+
+ t = C2T(p, cf);
+
+ *pap = p->pages[t->page].phy + t->chunk * p->chunk_size;
+ *hp = HMAKE(t->page, t->chunk);
+
+ t->page |= MBP_CARD | MBP_USED;
+
+ return (cf);
+}
+
+/*
+ * Free a chunk
+ */
+void
+mbp_free(struct mbpool *p, void *ptr)
+{
+ struct mbtrail *t;
+
+ mtx_lock(&p->free_lock);
+ t = C2T(p, ptr);
+ t->page &= ~(MBP_USED | MBP_CARD);
+ SLIST_INSERT_HEAD(&p->free_list, (struct mbfree *)ptr, link);
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Mbuf system external mbuf free routine
+ */
+int
+mbp_ext_free(struct mbuf *m, void *buf, void *arg)
+{
+ mbp_free(arg, buf);
+
+ return (EXT_FREE_OK);
+}
+
+/*
+ * Free all buffers that are marked as being on the card
+ */
+void
+mbp_card_free(struct mbpool *p)
+{
+ u_int i, b;
+ struct mbpage *pg;
+ struct mbtrail *tr;
+ struct mbfree *cf;
+
+ mtx_lock(&p->free_lock);
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+ for (b = 0; b < p->nchunks; b++) {
+ cf = N2C(p, pg, b);
+ tr = C2T(p, cf);
+ if (tr->page & MBP_CARD) {
+ tr->page &= MBP_PMSK;
+ SLIST_INSERT_HEAD(&p->free_list, cf, link);
+ }
+ }
+ }
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Count buffers
+ */
+void
+mbp_count(struct mbpool *p, u_int *used, u_int *card, u_int *free)
+{
+ u_int i, b;
+ struct mbpage *pg;
+ struct mbtrail *tr;
+ struct mbfree *cf;
+
+ *used = *card = *free = 0;
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+ for (b = 0; b < p->nchunks; b++) {
+ tr = C2T(p, N2C(p, pg, b));
+ if (tr->page & MBP_CARD)
+ (*card)++;
+ if (tr->page & MBP_USED)
+ (*used)++;
+ }
+ }
+ mtx_lock(&p->free_lock);
+ SLIST_FOREACH(cf, &p->free_list, link)
+ (*free)++;
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Get the buffer from a handle and clear the card flag.
+ */
+void *
+mbp_get(struct mbpool *p, uint32_t h)
+{
+ struct mbfree *cf;
+ struct mbtrail *tr;
+
+ cf = N2C(p, &p->pages[HPAGE(h)], HCHUNK(h));
+ tr = C2T(p, cf);
+
+#ifdef DIAGNOSTIC
+ if (!(tr->page & MBP_CARD))
+ printf("%s: (%s) chunk %u page %u not on card\n", __func__,
+ p->name, HCHUNK(h), HPAGE(h));
+#endif
+
+ tr->page &= ~MBP_CARD;
+ return (cf);
+}
+
+/*
+ * Get the buffer from a handle and keep the card flag.
+ */
+void *
+mbp_get_keep(struct mbpool *p, uint32_t h)
+{
+ struct mbfree *cf;
+ struct mbtrail *tr;
+
+ cf = N2C(p, &p->pages[HPAGE(h)], HCHUNK(h));
+ tr = C2T(p, cf);
+
+#ifdef DIAGNOSTIC
+ if (!(tr->page & MBP_CARD))
+ printf("%s: (%s) chunk %u page %u not on card\n", __func__,
+ p->name, HCHUNK(h), HPAGE(h));
+#endif
+
+ return (cf);
+}
+
+/*
+ * sync the chunk
+ */
+void
+mbp_sync(struct mbpool *p, uint32_t h, bus_addr_t off, bus_size_t len, u_int op)
+{
+
+#if 0
+ bus_dmamap_sync_size(p->dmat, p->pages[HPAGE(h)].map,
+ HCHUNK(h) * p->chunk_size + off, len, op);
+#endif
+}
diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c
new file mode 100644
index 0000000..e9d7d22
--- /dev/null
+++ b/sys/kern/subr_mchain.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2000, 2001 Boris Popov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/uio.h>
+
+#include <sys/mchain.h>
+
+FEATURE(libmchain, "mchain library");
+
+MODULE_VERSION(libmchain, 1);
+
+#define MBERROR(format, ...) printf("%s(%d): "format, __func__ , \
+ __LINE__ , ## __VA_ARGS__)
+
+#define MBPANIC(format, ...) printf("%s(%d): "format, __func__ , \
+ __LINE__ , ## __VA_ARGS__)
+
+/*
+ * Various helper functions
+ */
+int
+mb_init(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_len = 0;
+ mb_initm(mbp, m);
+ return (0);
+}
+
+void
+mb_initm(struct mbchain *mbp, struct mbuf *m)
+{
+ bzero(mbp, sizeof(*mbp));
+ mbp->mb_top = mbp->mb_cur = m;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+}
+
+void
+mb_done(struct mbchain *mbp)
+{
+ if (mbp->mb_top) {
+ m_freem(mbp->mb_top);
+ mbp->mb_top = NULL;
+ }
+}
+
+struct mbuf *
+mb_detach(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = mbp->mb_top;
+ mbp->mb_top = NULL;
+ return (m);
+}
+
+int
+mb_fixhdr(struct mbchain *mbp)
+{
+ return (mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top));
+}
+
+/*
+ * Check whether an object of size 'size' fits at the current position and
+ * allocate a new mbuf if not. Advance the pointers and increase the length
+ * of the mbuf(s). Return a pointer to the object placeholder, or NULL if an
+ * error occurred. Note: size must be <= MLEN.
+ */
+caddr_t
+mb_reserve(struct mbchain *mbp, int size)
+{
+ struct mbuf *m, *mn;
+ caddr_t bpos;
+
+ if (size > MLEN)
+ panic("mb_reserve: size = %d\n", size);
+ m = mbp->mb_cur;
+ if (mbp->mb_mleft < size) {
+ mn = m_get(M_WAITOK, MT_DATA);
+ mbp->mb_cur = m->m_next = mn;
+ m = mn;
+ m->m_len = 0;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ }
+ mbp->mb_mleft -= size;
+ mbp->mb_count += size;
+ bpos = mtod(m, caddr_t) + m->m_len;
+ m->m_len += size;
+ return (bpos);
+}
+
+int
+mb_put_padbyte(struct mbchain *mbp)
+{
+ caddr_t dst;
+ uint8_t x = 0;
+
+ dst = mtod(mbp->mb_cur, caddr_t) + mbp->mb_cur->m_len;
+
+ /* Only add padding if address is odd */
+ if ((unsigned long)dst & 1)
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+ else
+ return (0);
+}
+
+int
+mb_put_uint8(struct mbchain *mbp, uint8_t x)
+{
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint16be(struct mbchain *mbp, uint16_t x)
+{
+ x = htobe16(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint16le(struct mbchain *mbp, uint16_t x)
+{
+ x = htole16(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint32be(struct mbchain *mbp, uint32_t x)
+{
+ x = htobe32(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint32le(struct mbchain *mbp, uint32_t x)
+{
+ x = htole32(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_int64be(struct mbchain *mbp, int64_t x)
+{
+ x = htobe64(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_int64le(struct mbchain *mbp, int64_t x)
+{
+ x = htole64(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type)
+{
+ struct mbuf *m;
+ caddr_t dst;
+ c_caddr_t src;
+ int cplen, error, mleft, count;
+ size_t srclen, dstlen;
+
+ m = mbp->mb_cur;
+ mleft = mbp->mb_mleft;
+
+ while (size > 0) {
+ if (mleft == 0) {
+ if (m->m_next == NULL)
+ m = m_getm(m, size, M_WAITOK, MT_DATA);
+ else
+ m = m->m_next;
+ mleft = M_TRAILINGSPACE(m);
+ continue;
+ }
+ cplen = mleft > size ? size : mleft;
+ srclen = dstlen = cplen;
+ dst = mtod(m, caddr_t) + m->m_len;
+ switch (type) {
+ case MB_MCUSTOM:
+ srclen = size;
+ dstlen = mleft;
+ error = mbp->mb_copy(mbp, source, dst, &srclen, &dstlen);
+ if (error)
+ return (error);
+ break;
+ case MB_MINLINE:
+ for (src = source, count = cplen; count; count--)
+ *dst++ = *src++;
+ break;
+ case MB_MSYSTEM:
+ bcopy(source, dst, cplen);
+ break;
+ case MB_MUSER:
+ error = copyin(source, dst, cplen);
+ if (error)
+ return (error);
+ break;
+ case MB_MZERO:
+ bzero(dst, cplen);
+ break;
+ }
+ size -= srclen;
+ source += srclen;
+ m->m_len += dstlen;
+ mleft -= dstlen;
+ mbp->mb_count += dstlen;
+ }
+ mbp->mb_cur = m;
+ mbp->mb_mleft = mleft;
+ return (0);
+}
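+
+/*
+ * Usage sketch (illustrative only; 'cmd', 'payload' and 'paylen' stand for
+ * hypothetical caller data): composing a little-endian request with the put
+ * routines above and detaching the finished chain for transmission:
+ *
+ *	struct mbchain mb;
+ *	struct mbuf *m;
+ *
+ *	mb_init(&mb);
+ *	mb_put_uint16le(&mb, cmd);
+ *	mb_put_mem(&mb, payload, paylen, MB_MSYSTEM);
+ *	mb_fixhdr(&mb);
+ *	m = mb_detach(&mb);
+ *	mb_done(&mb);
+ */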
+
+int
+mb_put_mbuf(struct mbchain *mbp, struct mbuf *m)
+{
+ mbp->mb_cur->m_next = m;
+ while (m) {
+ mbp->mb_count += m->m_len;
+ if (m->m_next == NULL)
+ break;
+ m = m->m_next;
+ }
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ mbp->mb_cur = m;
+ return (0);
+}
+
+/*
+ * copies a uio scatter/gather list to an mbuf chain.
+ */
+int
+mb_put_uio(struct mbchain *mbp, struct uio *uiop, int size)
+{
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return (EFBIG);
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ if (left > size)
+ left = size;
+ error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype);
+ if (error)
+ return (error);
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base =
+ (char *)uiop->uio_iov->iov_base + left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return (0);
+}
+
+/*
+ * Routines for fetching data from an mbuf chain
+ */
+int
+md_init(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_len = 0;
+ md_initm(mdp, m);
+ return (0);
+}
+
+void
+md_initm(struct mdchain *mdp, struct mbuf *m)
+{
+ bzero(mdp, sizeof(*mdp));
+ mdp->md_top = mdp->md_cur = m;
+ mdp->md_pos = mtod(m, u_char*);
+}
+
+void
+md_done(struct mdchain *mdp)
+{
+ if (mdp->md_top) {
+ m_freem(mdp->md_top);
+ mdp->md_top = NULL;
+ }
+}
+
+/*
+ * Append a separate mbuf chain. It is the caller's responsibility to prevent
+ * multiple calls to fetch/record routines.
+ */
+void
+md_append_record(struct mdchain *mdp, struct mbuf *top)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL) {
+ md_initm(mdp, top);
+ return;
+ }
+ m = mdp->md_top;
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ m->m_nextpkt = top;
+ top->m_nextpkt = NULL;
+ return;
+}
+
+/*
+ * Advance to the next record, discarding the current one
+ */
+int
+md_next_record(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL)
+ return (ENOENT);
+ m = mdp->md_top->m_nextpkt;
+ md_done(mdp);
+ if (m == NULL)
+ return (ENOENT);
+ md_initm(mdp, m);
+ return (0);
+}
+
+int
+md_get_uint8(struct mdchain *mdp, uint8_t *x)
+{
+ return (md_get_mem(mdp, x, 1, MB_MINLINE));
+}
+
+int
+md_get_uint16(struct mdchain *mdp, uint16_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE));
+}
+
+int
+md_get_uint16le(struct mdchain *mdp, uint16_t *x)
+{
+ uint16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ if (x != NULL)
+ *x = le16toh(v);
+ return (error);
+}
+
+int
+md_get_uint16be(struct mdchain *mdp, uint16_t *x)
+{
+ uint16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ if (x != NULL)
+ *x = be16toh(v);
+ return (error);
+}
+
+int
+md_get_uint32(struct mdchain *mdp, uint32_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE));
+}
+
+int
+md_get_uint32be(struct mdchain *mdp, uint32_t *x)
+{
+ uint32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ if (x != NULL)
+ *x = be32toh(v);
+ return (error);
+}
+
+int
+md_get_uint32le(struct mdchain *mdp, uint32_t *x)
+{
+ uint32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ if (x != NULL)
+ *x = le32toh(v);
+ return (error);
+}
+
+int
+md_get_int64(struct mdchain *mdp, int64_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE));
+}
+
+int
+md_get_int64be(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ if (x != NULL)
+ *x = be64toh(v);
+ return (error);
+}
+
+int
+md_get_int64le(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ if (x != NULL)
+ *x = le64toh(v);
+ return (error);
+}
+
+int
+md_get_mem(struct mdchain *mdp, caddr_t target, int size, int type)
+{
+ struct mbuf *m = mdp->md_cur;
+ int error;
+ u_int count;
+ u_char *s;
+
+ while (size > 0) {
+ if (m == NULL) {
+ MBERROR("incomplete copy\n");
+ return (EBADRPC);
+ }
+ s = mdp->md_pos;
+ count = mtod(m, u_char*) + m->m_len - s;
+ if (count == 0) {
+ mdp->md_cur = m = m->m_next;
+ if (m)
+ s = mdp->md_pos = mtod(m, caddr_t);
+ continue;
+ }
+ if (count > size)
+ count = size;
+ size -= count;
+ mdp->md_pos += count;
+ if (target == NULL)
+ continue;
+ switch (type) {
+ case MB_MUSER:
+ error = copyout(s, target, count);
+ if (error)
+				return (error);
+ break;
+ case MB_MSYSTEM:
+ bcopy(s, target, count);
+ break;
+ case MB_MINLINE:
+ while (count--)
+ *target++ = *s++;
+ continue;
+ }
+ target += count;
+ }
+ return (0);
+}
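+
+/*
+ * Usage sketch (illustrative only; 'm' is a received chain and 'body' a
+ * hypothetical destination buffer): pulling fixed-size fields and a
+ * variable-length body out of an mbuf chain:
+ *
+ *	struct mdchain md;
+ *	uint16_t cmd;
+ *	uint32_t bodylen;
+ *
+ *	md_initm(&md, m);
+ *	md_get_uint16le(&md, &cmd);
+ *	md_get_uint32le(&md, &bodylen);
+ *	md_get_mem(&md, body, bodylen, MB_MSYSTEM);
+ *	md_done(&md);
+ */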
+
+int
+md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret)
+{
+ struct mbuf *m = mdp->md_cur, *rm;
+
+ rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_WAITOK);
+ md_get_mem(mdp, NULL, size, MB_MZERO);
+ *ret = rm;
+ return (0);
+}
+
+int
+md_get_uio(struct mdchain *mdp, struct uio *uiop, int size)
+{
+ char *uiocp;
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return (EFBIG);
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ uiocp = uiop->uio_iov->iov_base;
+ if (left > size)
+ left = size;
+ error = md_get_mem(mdp, uiocp, left, mtype);
+ if (error)
+ return (error);
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base =
+ (char *)uiop->uio_iov->iov_base + left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return (0);
+}
diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c
new file mode 100644
index 0000000..2485c94
--- /dev/null
+++ b/sys/kern/subr_module.c
@@ -0,0 +1,290 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/linker.h>
+
+/*
+ * Preloaded module support
+ */
+
+vm_offset_t preload_addr_relocate = 0;
+caddr_t preload_metadata;
+
+/*
+ * Search for the preloaded module (name)
+ */
+caddr_t
+preload_search_by_name(const char *name)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if ((hdr[0] == MODINFO_NAME) &&
+ !strcmp(name, curp + sizeof(uint32_t) * 2))
+ return(curp);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Search for the first preloaded module of (type)
+ */
+caddr_t
+preload_search_by_type(const char *type)
+{
+ caddr_t curp, lname;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ lname = NULL;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* remember the start of each record */
+ if (hdr[0] == MODINFO_NAME)
+ lname = curp;
+
+ /* Search for a MODINFO_TYPE field */
+ if ((hdr[0] == MODINFO_TYPE) &&
+ !strcmp(type, curp + sizeof(uint32_t) * 2))
+ return(lname);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Walk through the preloaded module list
+ */
+caddr_t
+preload_search_next_name(caddr_t base)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ /* Pick up where we left off last time */
+ if (base) {
+ /* skip to next field */
+ curp = base;
+ hdr = (uint32_t *)curp;
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ } else
+ curp = preload_metadata;
+
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Found a new record? */
+ if (hdr[0] == MODINFO_NAME)
+				return(curp);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
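+
+/*
+ * Usage sketch (illustrative only): iterating over every preloaded module
+ * record, starting the walk by passing NULL:
+ *
+ *	caddr_t mod;
+ *
+ *	for (mod = preload_search_next_name(NULL); mod != NULL;
+ *	    mod = preload_search_next_name(mod))
+ *		... inspect the module record at 'mod' ...
+ */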
+
+/*
+ * Given a preloaded module handle (mod), return a pointer
+ * to the data for the attribute (inf).
+ */
+caddr_t
+preload_search_info(caddr_t mod, int inf)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ uint32_t type = 0;
+ int next;
+
+ curp = mod;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ /* end of module data? */
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+ /*
+ * We give up once we've looped back to what we were looking at
+ * first - this should normally be a MODINFO_NAME field.
+ */
+ if (type == 0) {
+ type = hdr[0];
+ } else {
+ if (hdr[0] == type)
+ break;
+ }
+
+ /*
+ * Attribute match? Return pointer to data.
+ * Consumer may safely assume that size value precedes
+ * data.
+ */
+ if (hdr[0] == inf)
+ return(curp + (sizeof(uint32_t) * 2));
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ return(NULL);
+}
+
+/*
+ * Delete a preload record by name.
+ */
+void
+preload_delete_name(const char *name)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+ int clearing;
+
+ if (preload_metadata != NULL) {
+
+ clearing = 0;
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if (hdr[0] == MODINFO_NAME) {
+ if (!strcmp(name, curp + sizeof(uint32_t) * 2))
+ clearing = 1; /* got it, start clearing */
+ else if (clearing)
+					clearing = 0; /* at the next record now; stop clearing */
+ }
+ if (clearing)
+ hdr[0] = MODINFO_EMPTY;
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
+
+void *
+preload_fetch_addr(caddr_t mod)
+{
+ caddr_t *mdp;
+
+ mdp = (caddr_t *)preload_search_info(mod, MODINFO_ADDR);
+ if (mdp == NULL)
+ return (NULL);
+ return (*mdp + preload_addr_relocate);
+}
+
+size_t
+preload_fetch_size(caddr_t mod)
+{
+ size_t *mdp;
+
+ mdp = (size_t *)preload_search_info(mod, MODINFO_SIZE);
+ if (mdp == NULL)
+ return (0);
+ return (*mdp);
+}
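+
+/*
+ * Usage sketch (illustrative only; the type string is just an example):
+ * locate the first preloaded module of a given type and fetch its load
+ * address and size with the helpers above:
+ *
+ *	caddr_t mod;
+ *	void *addr;
+ *	size_t size;
+ *
+ *	mod = preload_search_by_type("elf kernel");
+ *	if (mod != NULL) {
+ *		addr = preload_fetch_addr(mod);
+ *		size = preload_fetch_size(mod);
+ *	}
+ */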
+
+/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */
+void
+preload_bootstrap_relocate(vm_offset_t offset)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ vm_offset_t *ptr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Deal with the ones that we know we have to fix */
+ switch (hdr[0]) {
+ case MODINFO_ADDR:
+ case MODINFO_METADATA|MODINFOMD_SSYM:
+ case MODINFO_METADATA|MODINFOMD_ESYM:
+ ptr = (vm_offset_t *)(curp + (sizeof(uint32_t) * 2));
+ *ptr += offset;
+ break;
+ }
+ /* The rest is beyond us for now */
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
diff --git a/sys/kern/subr_msgbuf.c b/sys/kern/subr_msgbuf.c
new file mode 100644
index 0000000..ecdbe72
--- /dev/null
+++ b/sys/kern/subr_msgbuf.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright (c) 2003 Ian Dowse. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Generic message buffer support routines.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/msgbuf.h>
+#include <sys/sysctl.h>
+
+/*
+ * Maximum number conversion buffer length: uintmax_t in base 2, plus <>
+ * around the priority, and a terminating NUL.
+ */
+#define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3)
+
+/* Read/write sequence numbers are modulo a multiple of the buffer size. */
+#define SEQMOD(size) ((size) * 16)
+
+static u_int msgbuf_cksum(struct msgbuf *mbp);
+
+/*
+ * Timestamps in msgbuf are useful when trying to diagnose when core dumps
+ * or other actions occurred.
+ */
+static int msgbuf_show_timestamp = 0;
+SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RW | CTLFLAG_TUN,
+ &msgbuf_show_timestamp, 0, "Show timestamp in msgbuf");
+TUNABLE_INT("kern.msgbuf_show_timestamp", &msgbuf_show_timestamp);
+
+/*
+ * Initialize a message buffer of the specified size at the specified
+ * location. This also zeros the buffer area.
+ */
+void
+msgbuf_init(struct msgbuf *mbp, void *ptr, int size)
+{
+
+ mbp->msg_ptr = ptr;
+ mbp->msg_size = size;
+ mbp->msg_seqmod = SEQMOD(size);
+ msgbuf_clear(mbp);
+ mbp->msg_magic = MSG_MAGIC;
+ mbp->msg_lastpri = -1;
+ mbp->msg_flags = 0;
+ bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
+ mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
+}
+
+/*
+ * Reinitialize a message buffer, retaining its previous contents if
+ * the size and checksum are correct. If the old contents cannot be
+ * recovered, the message buffer is cleared.
+ */
+void
+msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size)
+{
+ u_int cksum;
+
+ if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) {
+ msgbuf_init(mbp, ptr, size);
+ return;
+ }
+ mbp->msg_seqmod = SEQMOD(size);
+ mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq);
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq);
+ mbp->msg_ptr = ptr;
+ cksum = msgbuf_cksum(mbp);
+ if (cksum != mbp->msg_cksum) {
+ if (bootverbose) {
+ printf("msgbuf cksum mismatch (read %x, calc %x)\n",
+ mbp->msg_cksum, cksum);
+ printf("Old msgbuf not recovered\n");
+ }
+ msgbuf_clear(mbp);
+ }
+
+ mbp->msg_lastpri = -1;
+ /* Assume that the old message buffer didn't end in a newline. */
+ mbp->msg_flags |= MSGBUF_NEEDNL;
+ bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
+ mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
+}
+
+/*
+ * Clear the message buffer.
+ */
+void
+msgbuf_clear(struct msgbuf *mbp)
+{
+
+ bzero(mbp->msg_ptr, mbp->msg_size);
+ mbp->msg_wseq = 0;
+ mbp->msg_rseq = 0;
+ mbp->msg_cksum = 0;
+}
+
+/*
+ * Get a count of the number of unread characters in the message buffer.
+ */
+int
+msgbuf_getcount(struct msgbuf *mbp)
+{
+ u_int len;
+
+ len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq);
+ if (len > mbp->msg_size)
+ len = mbp->msg_size;
+ return (len);
+}
+
+/*
+ * Add a character into the message buffer, and update the checksum and
+ * sequence number.
+ *
+ * The caller should hold the message buffer spinlock.
+ */
+
+static void
+msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c)
+{
+ u_int pos;
+
+ /* Make sure we properly wrap the sequence number. */
+ pos = MSGBUF_SEQ_TO_POS(mbp, *seq);
+ mbp->msg_cksum += (u_int)(u_char)c -
+ (u_int)(u_char)mbp->msg_ptr[pos];
+ mbp->msg_ptr[pos] = c;
+ *seq = MSGBUF_SEQNORM(mbp, *seq + 1);
+}
+
+/*
+ * Append a character to a message buffer.
+ */
+void
+msgbuf_addchar(struct msgbuf *mbp, int c)
+{
+ mtx_lock_spin(&mbp->msg_lock);
+
+ msgbuf_do_addchar(mbp, &mbp->msg_wseq, c);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+}
+
+/*
+ * Append a NUL-terminated string with a priority to a message buffer.
+ * Filter carriage returns if the caller requests it.
+ *
+ * XXX The carriage return filtering behavior is present in the
+ * msglogchar() API; however, testing has shown that we don't seem to send
+ * carriage returns down this path. So do we still need it?
+ */
+void
+msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr)
+{
+ u_int seq;
+ size_t len, prefix_len;
+ char prefix[MAXPRIBUF];
+ char buf[32];
+	int i, j, needtime;
+
+ len = strlen(str);
+ prefix_len = 0;
+
+ /* If we have a zero-length string, no need to do anything. */
+ if (len == 0)
+ return;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ /*
+ * If this is true, we may need to insert a new priority sequence,
+ * so prepare the prefix.
+ */
+ if (pri != -1)
+ prefix_len = sprintf(prefix, "<%d>", pri);
+
+ /*
+ * Starting write sequence number.
+ */
+ seq = mbp->msg_wseq;
+
+ /*
+ * Whenever there is a change in priority, we have to insert a
+ * newline, and a priority prefix if the priority is not -1. Here
+ * we detect whether there was a priority change, and whether we
+ * did not end with a newline. If that is the case, we need to
+ * insert a newline before this string.
+ */
+ if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) {
+
+ msgbuf_do_addchar(mbp, &seq, '\n');
+ mbp->msg_flags &= ~MSGBUF_NEEDNL;
+ }
+
+ needtime = 1;
+ for (i = 0; i < len; i++) {
+ /*
+ * If we just had a newline, and the priority is not -1
+ * (and therefore prefix_len != 0), then we need a priority
+ * prefix for this line.
+ */
+ if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) {
+			for (j = 0; j < prefix_len; j++)
+ msgbuf_do_addchar(mbp, &seq, prefix[j]);
+ }
+
+ if (msgbuf_show_timestamp && needtime == 1 &&
+ (mbp->msg_flags & MSGBUF_NEEDNL) == 0) {
+
+ snprintf(buf, sizeof(buf), "[%jd] ",
+ (intmax_t)time_uptime);
+ for (j = 0; buf[j] != '\0'; j++)
+ msgbuf_do_addchar(mbp, &seq, buf[j]);
+ needtime = 0;
+ }
+
+ /*
+ * Don't copy carriage returns if the caller requested
+ * filtering.
+ *
+ * XXX This matches the behavior of msglogchar(), but is it
+ * necessary? Testing has shown that we don't seem to get
+ * carriage returns here.
+ */
+ if ((filter_cr != 0) && (str[i] == '\r'))
+ continue;
+
+ /*
+ * Clear this flag if we see a newline. This affects whether
+ * we need to insert a new prefix or insert a newline later.
+ */
+ if (str[i] == '\n')
+ mbp->msg_flags &= ~MSGBUF_NEEDNL;
+ else
+ mbp->msg_flags |= MSGBUF_NEEDNL;
+
+ msgbuf_do_addchar(mbp, &seq, str[i]);
+ }
+ /*
+ * Update the write sequence number for the actual number of
+ * characters we put in the message buffer. (Depends on whether
+ * carriage returns are filtered.)
+ */
+ mbp->msg_wseq = seq;
+
+ /*
+ * Set the last priority.
+ */
+ mbp->msg_lastpri = pri;
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+}
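+
+/*
+ * Informal example: with a freshly cleared buffer, msgbuf_addstr(mbp, 6,
+ * "hello\n", 0) stores "<6>hello\n"; when kern.msgbuf_show_timestamp is
+ * set, an "[uptime] " stamp follows the priority prefix.
+ */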
+
+/*
+ * Read and mark as read a character from a message buffer.
+ * Returns the character, or -1 if no characters are available.
+ */
+int
+msgbuf_getchar(struct msgbuf *mbp)
+{
+ u_int len, wseq;
+ int c;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (-1);
+ }
+ if (len > mbp->msg_size)
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)];
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (c);
+}
+
+/*
+ * Read and mark as read a number of characters from a message buffer.
+ * Returns the number of characters that were placed in `buf'.
+ */
+int
+msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen)
+{
+ u_int len, pos, wseq;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+ if (len > mbp->msg_size) {
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ len = mbp->msg_size;
+ }
+ pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq);
+ len = min(len, mbp->msg_size - pos);
+ len = min(len, (u_int)buflen);
+
+ bcopy(&mbp->msg_ptr[pos], buf, len);
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (len);
+}
+
+/*
+ * Peek at the full contents of a message buffer without marking any
+ * data as read. `seqp' should point to an unsigned integer that
+ * msgbuf_peekbytes() can use to retain state between calls so that
+ * the whole message buffer can be read in multiple short reads.
+ * To initialise this variable to the start of the message buffer,
+ * call msgbuf_peekbytes() with a NULL `buf' parameter.
+ *
+ * Returns the number of characters that were placed in `buf'.
+ */
+int
+msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp)
+{
+ u_int len, pos, wseq;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ if (buf == NULL) {
+ /* Just initialise *seqp. */
+ *seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size);
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, *seqp);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+ if (len > mbp->msg_size) {
+ *seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ len = mbp->msg_size;
+ }
+ pos = MSGBUF_SEQ_TO_POS(mbp, *seqp);
+ len = min(len, mbp->msg_size - pos);
+ len = min(len, (u_int)buflen);
+	bcopy(&mbp->msg_ptr[pos], buf, len);
+ *seqp = MSGBUF_SEQNORM(mbp, *seqp + len);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (len);
+}
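+
+/*
+ * Usage sketch (illustrative only; 'mbp' is some struct msgbuf pointer):
+ * draining the whole buffer without consuming it, using a small
+ * caller-supplied chunk buffer:
+ *
+ *	u_int seq;
+ *	char chunk[128];
+ *	int n;
+ *
+ *	(void)msgbuf_peekbytes(mbp, NULL, 0, &seq);
+ *	while ((n = msgbuf_peekbytes(mbp, chunk, sizeof(chunk), &seq)) > 0)
+ *		... copy out 'n' bytes from 'chunk' ...
+ */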
+
+/*
+ * Compute the checksum for the complete message buffer contents.
+ */
+static u_int
+msgbuf_cksum(struct msgbuf *mbp)
+{
+ u_int i, sum;
+
+ sum = 0;
+ for (i = 0; i < mbp->msg_size; i++)
+ sum += (u_char)mbp->msg_ptr[i];
+ return (sum);
+}
+
+/*
+ * Copy from one message buffer to another.
+ */
+void
+msgbuf_copy(struct msgbuf *src, struct msgbuf *dst)
+{
+ int c;
+
+ while ((c = msgbuf_getchar(src)) >= 0)
+ msgbuf_addchar(dst, c);
+}
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
new file mode 100644
index 0000000..a2e822c
--- /dev/null
+++ b/sys/kern/subr_param.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 1980, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.c 8.3 (Berkeley) 8/20/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+#include "opt_msgbuf.h"
+#include "opt_maxusers.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/msgbuf.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+/*
+ * System parameter formulae.
+ */
+
+#ifndef HZ
+# if defined(__mips__) || defined(__arm__)
+# define HZ 100
+# else
+# define HZ 1000
+# endif
+# ifndef HZ_VM
+# define HZ_VM 100
+# endif
+#else
+# ifndef HZ_VM
+# define HZ_VM HZ
+# endif
+#endif
+#define NPROC (20 + 16 * maxusers)
+#ifndef NBUF
+#define NBUF 0
+#endif
+#ifndef MAXFILES
+#define MAXFILES (maxproc * 2)
+#endif
+
+static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS);
+
+int hz; /* system clock's frequency */
+int tick; /* usec per tick (1000000 / hz) */
+struct bintime tick_bt; /* bintime per tick (1s / hz) */
+sbintime_t tick_sbt;
+int maxusers; /* base tunable */
+int maxproc; /* maximum # of processes */
+int maxprocperuid; /* max # of procs per user */
+int maxfiles; /* sys. wide open files limit */
+int maxfilesperproc; /* per-proc open files limit */
+int msgbufsize; /* size of kernel message buffer */
+int nbuf;
+int bio_transient_maxcnt;
+int ngroups_max; /* max # groups per process */
+int nswbuf;
+pid_t pid_max = PID_MAX;
+long maxswzone; /* max swmeta KVA storage */
+long maxbcache; /* max buffer cache KVA storage */
+long maxpipekva; /* Limit on pipe KVA */
+int vm_guest; /* Running as virtual machine guest? */
+u_long maxtsiz; /* max text size */
+u_long dfldsiz; /* initial data size limit */
+u_long maxdsiz; /* max data size */
+u_long dflssiz; /* initial stack size limit */
+u_long maxssiz; /* max stack size */
+u_long sgrowsiz; /* amount to grow stack */
+
+SYSCTL_INT(_kern, OID_AUTO, hz, CTLFLAG_RDTUN, &hz, 0,
+ "Number of clock ticks per second");
+SYSCTL_INT(_kern, OID_AUTO, nbuf, CTLFLAG_RDTUN, &nbuf, 0,
+ "Number of buffers in the buffer cache");
+SYSCTL_INT(_kern, OID_AUTO, nswbuf, CTLFLAG_RDTUN, &nswbuf, 0,
+ "Number of swap buffers");
+SYSCTL_INT(_kern, OID_AUTO, msgbufsize, CTLFLAG_RDTUN, &msgbufsize, 0,
+ "Size of the kernel message buffer");
+SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0,
+ "Maximum memory for swap metadata");
+SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0,
+ "Maximum value of vfs.maxbufspace");
+SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN,
+ &bio_transient_maxcnt, 0,
+    "Maximum number of transient BIO mappings");
+SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0,
+ "Maximum text size");
+SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0,
+ "Initial data size limit");
+SYSCTL_ULONG(_kern, OID_AUTO, maxdsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxdsiz, 0,
+ "Maximum data size");
+SYSCTL_ULONG(_kern, OID_AUTO, dflssiz, CTLFLAG_RW | CTLFLAG_TUN, &dflssiz, 0,
+ "Initial stack size limit");
+SYSCTL_ULONG(_kern, OID_AUTO, maxssiz, CTLFLAG_RW | CTLFLAG_TUN, &maxssiz, 0,
+ "Maximum stack size");
+SYSCTL_ULONG(_kern, OID_AUTO, sgrowsiz, CTLFLAG_RW | CTLFLAG_TUN, &sgrowsiz, 0,
+ "Amount to grow stack on a stack fault");
+SYSCTL_PROC(_kern, OID_AUTO, vm_guest, CTLFLAG_RD | CTLTYPE_STRING,
+ NULL, 0, sysctl_kern_vm_guest, "A",
+ "Virtual machine guest detected? (none|generic|xen)");
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct buf *swbuf;
+
+/*
+ * The elements of this array are ordered based upon the values of the
+ * corresponding enum VM_GUEST members.
+ */
+static const char *const vm_guest_sysctl_names[] = {
+ "none",
+ "generic",
+ "xen",
+ NULL
+};
+
+#ifndef XEN
+static const char *const vm_bnames[] = {
+ "QEMU", /* QEMU */
+ "Plex86", /* Plex86 */
+ "Bochs", /* Bochs */
+ "Xen", /* Xen */
+ "BHYVE", /* bhyve */
+ "Seabios", /* KVM */
+ NULL
+};
+
+static const char *const vm_pnames[] = {
+ "VMware Virtual Platform", /* VMWare VM */
+ "Virtual Machine", /* Microsoft VirtualPC */
+ "VirtualBox", /* Sun xVM VirtualBox */
+ "Parallels Virtual Platform", /* Parallels VM */
+ "KVM", /* KVM */
+ NULL
+};
+
+
+/*
+ * Detect known Virtual Machine hosts by inspecting the emulated BIOS.
+ */
+static enum VM_GUEST
+detect_virtual(void)
+{
+ char *sysenv;
+ int i;
+
+ sysenv = getenv("smbios.bios.vendor");
+ if (sysenv != NULL) {
+ for (i = 0; vm_bnames[i] != NULL; i++)
+ if (strcmp(sysenv, vm_bnames[i]) == 0) {
+ freeenv(sysenv);
+ return (VM_GUEST_VM);
+ }
+ freeenv(sysenv);
+ }
+ sysenv = getenv("smbios.system.product");
+ if (sysenv != NULL) {
+ for (i = 0; vm_pnames[i] != NULL; i++)
+ if (strcmp(sysenv, vm_pnames[i]) == 0) {
+ freeenv(sysenv);
+ return (VM_GUEST_VM);
+ }
+ freeenv(sysenv);
+ }
+ return (VM_GUEST_NO);
+}
+#endif
+
+/*
+ * Boot time overrides that are not scaled against main memory
+ */
+void
+init_param1(void)
+{
+#ifndef XEN
+ vm_guest = detect_virtual();
+#else
+ vm_guest = VM_GUEST_XEN;
+#endif
+ hz = -1;
+ TUNABLE_INT_FETCH("kern.hz", &hz);
+ if (hz == -1)
+ hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ;
+ tick = 1000000 / hz;
+ tick_sbt = SBT_1S / hz;
+ tick_bt = sbttobt(tick_sbt);
+
+#ifdef VM_SWZONE_SIZE_MAX
+ maxswzone = VM_SWZONE_SIZE_MAX;
+#endif
+ TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone);
+#ifdef VM_BCACHE_SIZE_MAX
+ maxbcache = VM_BCACHE_SIZE_MAX;
+#endif
+ TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache);
+ msgbufsize = MSGBUF_SIZE;
+ TUNABLE_INT_FETCH("kern.msgbufsize", &msgbufsize);
+
+ maxtsiz = MAXTSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxtsiz", &maxtsiz);
+ dfldsiz = DFLDSIZ;
+ TUNABLE_ULONG_FETCH("kern.dfldsiz", &dfldsiz);
+ maxdsiz = MAXDSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxdsiz", &maxdsiz);
+ dflssiz = DFLSSIZ;
+ TUNABLE_ULONG_FETCH("kern.dflssiz", &dflssiz);
+ maxssiz = MAXSSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxssiz", &maxssiz);
+ sgrowsiz = SGROWSIZ;
+ TUNABLE_ULONG_FETCH("kern.sgrowsiz", &sgrowsiz);
+
+ /*
+ * Let the administrator set {NGROUPS_MAX}, but disallow values
+ * less than NGROUPS_MAX which would violate POSIX.1-2008 or
+ * greater than INT_MAX-1 which would result in overflow.
+ */
+ ngroups_max = NGROUPS_MAX;
+ TUNABLE_INT_FETCH("kern.ngroups", &ngroups_max);
+ if (ngroups_max < NGROUPS_MAX)
+ ngroups_max = NGROUPS_MAX;
+
+ /*
+	 * Only allow the maximum pid to be lowered.
+ * Prevent setting up a non-bootable system if pid_max is too low.
+ */
+ TUNABLE_INT_FETCH("kern.pid_max", &pid_max);
+ if (pid_max > PID_MAX)
+ pid_max = PID_MAX;
+ else if (pid_max < 300)
+ pid_max = 300;
+
+ TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
+}
+
+/*
+ * Boot time overrides that are scaled against main memory
+ */
+void
+init_param2(long physpages)
+{
+
+ /* Base parameters */
+ maxusers = MAXUSERS;
+ TUNABLE_INT_FETCH("kern.maxusers", &maxusers);
+ if (maxusers == 0) {
+ maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE);
+ if (maxusers < 32)
+ maxusers = 32;
+#ifdef VM_MAX_AUTOTUNE_MAXUSERS
+ if (maxusers > VM_MAX_AUTOTUNE_MAXUSERS)
+ maxusers = VM_MAX_AUTOTUNE_MAXUSERS;
+#endif
+ /*
+		 * Scale down the rate at which maxusers grows once it
+		 * exceeds 384 (see the worked example below).
+ */
+ if (maxusers > 384)
+ maxusers = 384 + ((maxusers - 384) / 8);
+ }
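+
+	/*
+	 * Worked example (assuming 4 KB pages and no
+	 * VM_MAX_AUTOTUNE_MAXUSERS cap): 8 GB of RAM is roughly 2M pages,
+	 * so the initial estimate is 2M / 512 = 4096, which the rule above
+	 * scales down to 384 + (4096 - 384) / 8 = 848.
+	 */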
+
+ /*
+ * The following can be overridden after boot via sysctl. Note:
+	 * unless overridden, these macros are ultimately based on maxusers.
+ * Limit maxproc so that kmap entries cannot be exhausted by
+ * processes.
+ */
+ maxproc = NPROC;
+ TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
+ if (maxproc > (physpages / 12))
+ maxproc = physpages / 12;
+ maxprocperuid = (maxproc * 9) / 10;
+
+ /*
+	 * The default limit for maxfiles is 1/8 of the number of
+	 * physical pages, but never less than MAXFILES.
+	 * At most it can be 1/4 of the number of physical pages.
+ */
+ maxfiles = imax(MAXFILES, physpages / 8);
+ TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
+ if (maxfiles > (physpages / 4))
+ maxfiles = physpages / 4;
+ maxfilesperproc = (maxfiles / 10) * 9;
+
+ /*
+ * Cannot be changed after boot.
+ */
+ nbuf = NBUF;
+ TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+ TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
+
+ /*
+ * The default for maxpipekva is min(1/64 of the kernel address space,
+ * max(1/64 of main memory, 512KB)). See sys_pipe.c for more details.
+ */
+ maxpipekva = (physpages / 64) * PAGE_SIZE;
+ TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
+ if (maxpipekva < 512 * 1024)
+ maxpipekva = 512 * 1024;
+ if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64)
+ maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
+ 64;
+}
+
+/*
+ * Sysctl stringifying handler for kern.vm_guest.
+ */
+static int
+sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS)
+{
+ return (SYSCTL_OUT(req, vm_guest_sysctl_names[vm_guest],
+ strlen(vm_guest_sysctl_names[vm_guest])));
+}
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
new file mode 100644
index 0000000..505a4df
--- /dev/null
+++ b/sys/kern/subr_pcpu.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 2001 Wind River Systems, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module provides MI support for per-cpu data.
+ *
+ * Each architecture determines the mapping of logical CPU IDs to physical
+ * CPUs. The requirements of this mapping are as follows:
+ * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1.
+ * - The mapping is not required to be dense. That is, there may be
+ * gaps in the mappings.
+ * - The platform sets the value of MAXCPU in <machine/param.h>.
+ * - It is suggested, but not required, that in the non-SMP case, the
+ * platform define MAXCPU to be 1 and define the logical ID of the
+ * sole CPU as 0.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <ddb/ddb.h>
+
+static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accounting.");
+
+struct dpcpu_free {
+ uintptr_t df_start;
+ int df_len;
+ TAILQ_ENTRY(dpcpu_free) df_link;
+};
+
+static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]);
+static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head);
+static struct sx dpcpu_lock;
+uintptr_t dpcpu_off[MAXCPU];
+struct pcpu *cpuid_to_pcpu[MAXCPU];
+struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead);
+
+/*
+ * Initialize the MI portions of a struct pcpu.
+ */
+void
+pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
+{
+
+ bzero(pcpu, size);
+ KASSERT(cpuid >= 0 && cpuid < MAXCPU,
+ ("pcpu_init: invalid cpuid %d", cpuid));
+ pcpu->pc_cpuid = cpuid;
+ cpuid_to_pcpu[cpuid] = pcpu;
+ STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu);
+ cpu_pcpu_init(pcpu, cpuid, size);
+ pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue;
+ pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue;
+}
+
+void
+dpcpu_init(void *dpcpu, int cpuid)
+{
+ struct pcpu *pcpu;
+
+ pcpu = pcpu_find(cpuid);
+ pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START;
+
+ /*
+ * Initialize defaults from our linker section.
+ */
+ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES);
+
+ /*
+ * Place it in the global pcpu offset array.
+ */
+ dpcpu_off[cpuid] = pcpu->pc_dynamic;
+}
+
+static void
+dpcpu_startup(void *dummy __unused)
+{
+ struct dpcpu_free *df;
+
+ df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO);
+ df->df_start = (uintptr_t)&DPCPU_NAME(modspace);
+ df->df_len = DPCPU_MODMIN;
+ TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link);
+ sx_init(&dpcpu_lock, "dpcpu alloc lock");
+}
+SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, 0);
+
+/*
+ * First-fit extent based allocator for allocating space in the per-cpu
+ * region reserved for modules. This is only intended for use by the
+ * kernel linkers to place module linker sets.
+ */
+void *
+dpcpu_alloc(int size)
+{
+ struct dpcpu_free *df;
+ void *s;
+
+ s = NULL;
+ size = roundup2(size, sizeof(void *));
+ sx_xlock(&dpcpu_lock);
+ TAILQ_FOREACH(df, &dpcpu_head, df_link) {
+ if (df->df_len < size)
+ continue;
+ if (df->df_len == size) {
+ s = (void *)df->df_start;
+ TAILQ_REMOVE(&dpcpu_head, df, df_link);
+ free(df, M_PCPU);
+ break;
+ }
+ s = (void *)df->df_start;
+ df->df_len -= size;
+ df->df_start = df->df_start + size;
+ break;
+ }
+ sx_xunlock(&dpcpu_lock);
+
+ return (s);
+}
+
+/*
+ * Free dynamic per-cpu space at module unload time.
+ */
+void
+dpcpu_free(void *s, int size)
+{
+ struct dpcpu_free *df;
+ struct dpcpu_free *dn;
+ uintptr_t start;
+ uintptr_t end;
+
+ size = roundup2(size, sizeof(void *));
+ start = (uintptr_t)s;
+ end = start + size;
+ /*
+ * Free a region of space and merge it with as many neighbors as
+ * possible. Keeping the list sorted simplifies this operation.
+ */
+ sx_xlock(&dpcpu_lock);
+ TAILQ_FOREACH(df, &dpcpu_head, df_link) {
+ if (df->df_start > end)
+ break;
+ /*
+ * If we expand at the end of an entry we may have to
+ * merge it with the one following it as well.
+ */
+ if (df->df_start + df->df_len == start) {
+ df->df_len += size;
+ dn = TAILQ_NEXT(df, df_link);
+			if (dn != NULL &&
+			    df->df_start + df->df_len == dn->df_start) {
+ df->df_len += dn->df_len;
+ TAILQ_REMOVE(&dpcpu_head, dn, df_link);
+ free(dn, M_PCPU);
+ }
+ sx_xunlock(&dpcpu_lock);
+ return;
+ }
+ if (df->df_start == end) {
+ df->df_start = start;
+ df->df_len += size;
+ sx_xunlock(&dpcpu_lock);
+ return;
+ }
+ }
+ dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO);
+ dn->df_start = start;
+ dn->df_len = size;
+ if (df)
+ TAILQ_INSERT_BEFORE(df, dn, df_link);
+ else
+ TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link);
+ sx_xunlock(&dpcpu_lock);
+}
+
+/*
+ * Initialize the per-cpu storage from an updated linker-set region.
+ */
+void
+dpcpu_copy(void *s, int size)
+{
+#ifdef SMP
+ uintptr_t dpcpu;
+ int i;
+
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ memcpy((void *)(dpcpu + (uintptr_t)s), s, size);
+ }
+#else
+ memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size);
+#endif
+}
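+
+/*
+ * Usage sketch (illustrative only; 'image' and 'size' stand for a module's
+ * DPCPU linker-set data): a kernel linker back-end reserves modspace with
+ * dpcpu_alloc(), fills the master copy, and then replicates it to every
+ * CPU with dpcpu_copy():
+ *
+ *	void *base;
+ *
+ *	base = dpcpu_alloc(size);
+ *	if (base != NULL) {
+ *		memcpy(base, image, size);
+ *		dpcpu_copy(base, size);
+ *	}
+ */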
+
+/*
+ * Destroy a struct pcpu.
+ */
+void
+pcpu_destroy(struct pcpu *pcpu)
+{
+
+ STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu);
+ cpuid_to_pcpu[pcpu->pc_cpuid] = NULL;
+ dpcpu_off[pcpu->pc_cpuid] = 0;
+}
+
+/*
+ * Locate a struct pcpu by cpu id.
+ */
+struct pcpu *
+pcpu_find(u_int cpuid)
+{
+
+ return (cpuid_to_pcpu[cpuid]);
+}
+
+int
+sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ int64_t count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(int64_t *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+int
+sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ long count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(long *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+int
+sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ int count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(int *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off)
+{
+ int id;
+
+ CPU_FOREACH(id) {
+ db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n",
+ id, (uintmax_t)dpcpu_off[id],
+ (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START));
+ }
+}
+
+static void
+show_pcpu(struct pcpu *pc)
+{
+ struct thread *td;
+
+ db_printf("cpuid = %d\n", pc->pc_cpuid);
+ db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic);
+ db_printf("curthread = ");
+ td = pc->pc_curthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_name);
+ else
+ db_printf("none\n");
+ db_printf("curpcb = %p\n", pc->pc_curpcb);
+ db_printf("fpcurthread = ");
+ td = pc->pc_fpcurthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_name);
+ else
+ db_printf("none\n");
+ db_printf("idlethread = ");
+ td = pc->pc_idlethread;
+ if (td != NULL)
+ db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name);
+ else
+ db_printf("none\n");
+ db_show_mdpcpu(pc);
+
+#ifdef VIMAGE
+ db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet);
+#endif
+
+#ifdef WITNESS
+ db_printf("spin locks held:\n");
+ witness_list_locks(&pc->pc_spinlocks, db_printf);
+#endif
+}
+
+DB_SHOW_COMMAND(pcpu, db_show_pcpu)
+{
+ struct pcpu *pc;
+ int id;
+
+ if (have_addr)
+ id = ((addr >> 4) % 16) * 10 + (addr % 16);
+ else
+ id = PCPU_GET(cpuid);
+ pc = pcpu_find(id);
+ if (pc == NULL) {
+ db_printf("CPU %d not found\n", id);
+ return;
+ }
+ show_pcpu(pc);
+}
+
+DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all)
+{
+ struct pcpu *pc;
+ int id;
+
+ db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid));
+ for (id = 0; id <= mp_maxid; id++) {
+ pc = pcpu_find(id);
+ if (pc != NULL) {
+ show_pcpu(pc);
+ db_printf("\n");
+ }
+ }
+}
+DB_SHOW_ALIAS(allpcpu, db_show_cpu_all);
+#endif
diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
new file mode 100644
index 0000000..2bbd16d
--- /dev/null
+++ b/sys/kern/subr_pctrie.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ * to avoid a large maximum depth for the trie. This is a balance
+ * between the necessity to not wire too much physical memory for the nodes
+ * and the necessity to avoid too much cache pollution during the trie
+ * operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ *   the number of insert and remove operations. This implies that
+ *   optimizations which help one operation but hurt the others must be
+ *   carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ * level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/pctrie.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line. The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define PCTRIE_WIDTH 4
+#else
+#define PCTRIE_WIDTH 3
+#endif
+
+#define PCTRIE_COUNT (1 << PCTRIE_WIDTH)
+#define PCTRIE_MASK (PCTRIE_COUNT - 1)
+#define PCTRIE_LIMIT (howmany((sizeof(uint64_t) * NBBY), PCTRIE_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define PCTRIE_ISLEAF 0x1
+#define PCTRIE_FLAGS 0x1
+#define PCTRIE_PAD PCTRIE_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define PCTRIE_UNITLEVEL(lev) \
+ ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
+
+struct pctrie_node {
+ uint64_t pn_owner; /* Owner of record. */
+ uint16_t pn_count; /* Valid children. */
+ uint16_t pn_clev; /* Current level. */
+ void *pn_child[PCTRIE_COUNT]; /* Child nodes. */
+};
+
+/*
+ * Allocate a node. Pre-allocation should ensure that the request
+ * will always be satisfied.
+ */
+static __inline struct pctrie_node *
+pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
+ uint16_t count, uint16_t clevel)
+{
+ struct pctrie_node *node;
+
+ node = allocfn(ptree);
+ if (node == NULL)
+ return (NULL);
+ node->pn_owner = owner;
+ node->pn_count = count;
+ node->pn_clev = clevel;
+
+ return (node);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+#ifdef INVARIANTS
+ int slot;
+
+ KASSERT(node->pn_count == 0,
+ ("pctrie_node_put: node %p has %d children", node,
+ node->pn_count));
+ for (slot = 0; slot < PCTRIE_COUNT; slot++)
+ KASSERT(node->pn_child[slot] == NULL,
+ ("pctrie_node_put: node %p has a child", node));
+#endif
+ freefn(ptree, node);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+pctrie_slot(uint64_t index, uint16_t level)
+{
+
+ return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline uint64_t
+pctrie_trimkey(uint64_t index, uint16_t level)
+{
+ uint64_t ret;
+
+ ret = index;
+ if (level > 0) {
+ ret >>= level * PCTRIE_WIDTH;
+ ret <<= level * PCTRIE_WIDTH;
+ }
+ return (ret);
+}
+
+/*
+ * Get the root node for a tree.
+ */
+static __inline struct pctrie_node *
+pctrie_getroot(struct pctrie *ptree)
+{
+
+ return ((struct pctrie_node *)ptree->pt_root);
+}
+
+/*
+ * Set the root node for a tree.
+ */
+static __inline void
+pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
+{
+
+ ptree->pt_root = (uintptr_t)node;
+}
+
+/*
+ * Returns TRUE if the specified node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+pctrie_isleaf(struct pctrie_node *node)
+{
+
+ return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
+}
+
+/*
+ * Returns the value pointer extracted from the given leaf node.
+ */
+static __inline uint64_t *
+pctrie_toval(struct pctrie_node *node)
+{
+
+ return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
+}
+
+/*
+ * Adds the val as a child of the provided node.
+ */
+static __inline void
+pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
+ uint64_t *val)
+{
+ int slot;
+
+ slot = pctrie_slot(index, clev);
+ node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
+}
+
+/*
+ * Returns the level at which two keys differ.
+ * The two keys must not be equal.
+ */
+static __inline uint16_t
+pctrie_keydiff(uint64_t index1, uint64_t index2)
+{
+ uint16_t clev;
+
+ KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+ __func__, (uintmax_t)index1));
+
+ index1 ^= index2;
+ for (clev = PCTRIE_LIMIT;; clev--)
+ if (pctrie_slot(index1, clev) != 0)
+ return (clev);
+}
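For example, with the LP64 width of four bits per level, the keys 0x1200 and 0x1300 first differ in the nibble at level 2 (0x1200 ^ 0x1300 == 0x0100), so pctrie_keydiff() returns 2, and the intermediate node that pctrie_insert() builds for those two leaves bisects them at that level.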
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified node. Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
+{
+
+ if (node->pn_clev < PCTRIE_LIMIT) {
+ idx = pctrie_trimkey(idx, node->pn_clev + 1);
+ return (idx != node->pn_owner);
+ }
+ return (FALSE);
+}
+
+/*
+ * Internal helper for pctrie_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+ int slot;
+
+ KASSERT(node->pn_count <= PCTRIE_COUNT,
+ ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
+ for (slot = 0; node->pn_count != 0; slot++) {
+ if (node->pn_child[slot] == NULL)
+ continue;
+ if (!pctrie_isleaf(node->pn_child[slot]))
+ pctrie_reclaim_allnodes_int(ptree,
+ node->pn_child[slot], freefn);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ }
+ pctrie_node_put(ptree, node, freefn);
+}
+
+/*
+ * pctrie node zone initializer.
+ */
+int
+pctrie_zone_init(void *mem, int size __unused, int flags __unused)
+{
+ struct pctrie_node *node;
+
+ node = mem;
+ memset(node->pn_child, 0, sizeof(node->pn_child));
+ return (0);
+}
+
+size_t
+pctrie_node_size(void)
+{
+
+ return (sizeof(struct pctrie_node));
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
+{
+ uint64_t index, newind;
+ void **parentp;
+ struct pctrie_node *node, *tmp;
+ uint64_t *m;
+ int slot;
+ uint16_t clev;
+
+ index = *val;
+
+ /*
+ * The owner of record for root is not really important because it
+ * will never be used.
+ */
+ node = pctrie_getroot(ptree);
+ if (node == NULL) {
+ ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
+ return (0);
+ }
+ parentp = (void **)&ptree->pt_root;
+ for (;;) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ panic("%s: key %jx is already present",
+ __func__, (uintmax_t)index);
+ clev = pctrie_keydiff(*m, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ pctrie_addval(tmp, *m, clev, m);
+ return (0);
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ if (node->pn_child[slot] == NULL) {
+ node->pn_count++;
+ pctrie_addval(node, index, node->pn_clev, val);
+ return (0);
+ }
+ parentp = &node->pn_child[slot];
+ node = node->pn_child[slot];
+ }
+
+ /*
+ * A new node is needed because the right insertion level is reached.
+ * Set up the new intermediate node and add the two children: the
+ * new value and the older edge.
+ */
+ newind = node->pn_owner;
+ clev = pctrie_keydiff(newind, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ slot = pctrie_slot(newind, clev);
+ tmp->pn_child[slot] = node;
+
+ return (0);
+}
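A hedged usage sketch, not part of this patch: pctrie_insert() reads the key through the uint64_t pointer it is handed, so a consumer typically embeds the key at the start of its record and backs the node callbacks with a UMA zone sized via pctrie_node_size() and initialized with pctrie_zone_init(). All names below (myrec, my_node_zone, rec, tree) are invented for illustration; the callback typedefs live in sys/pctrie.h, which is not part of this hunk.

    struct myrec {
            uint64_t        mr_key;         /* lookup index; kept first so the
                                               returned uint64_t * can be cast
                                               back to the record */
            /* ... consumer data ... */
    };

    static uma_zone_t my_node_zone;         /* created elsewhere with
                                               pctrie_node_size() and
                                               pctrie_zone_init() */

    static struct pctrie_node *
    my_node_alloc(struct pctrie *ptree)
    {

            return (uma_zalloc(my_node_zone, M_NOWAIT));
    }

    static void
    my_node_free(struct pctrie *ptree, struct pctrie_node *node)
    {

            uma_zfree(my_node_zone, node);
    }

    /* In some consumer function: insert, look up again by key, then drop. */
            rec->mr_key = key;
            if (pctrie_insert(&tree, &rec->mr_key, my_node_alloc) != 0)
                    return (ENOMEM);
            rec = (struct myrec *)pctrie_lookup(&tree, key);
            pctrie_remove(&tree, key, my_node_free);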
+
+/*
+ * Returns the value stored at the index. If the index is not present,
+ * NULL is returned.
+ */
+uint64_t *
+pctrie_lookup(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *node;
+ uint64_t *m;
+ int slot;
+
+ node = pctrie_getroot(ptree);
+ while (node != NULL) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ return (m);
+ else
+ break;
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ node = node->pn_child[slot];
+ }
+ return (NULL);
+}
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m >= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the smallest key
+ * in the current node (if the owner is bigger than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_ge: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == (PCTRIE_COUNT - 1));
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is less than PCTRIE_COUNT - 1.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ index += PCTRIE_UNITLEVEL(node->pn_clev);
+ } else
+ index = node->pn_owner;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_ge: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or val within the current
+ * bisection node.
+ */
+ if (slot < (PCTRIE_COUNT - 1)) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index = pctrie_trimkey(index, node->pn_clev);
+ do {
+ index += inc;
+ slot++;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot < (PCTRIE_COUNT - 1));
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_ge: child is radix node"));
+
+ /*
+ * If a value or edge bigger than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_ge: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_ge: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
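As an illustrative example of the semantics: if the trie currently holds values keyed 10, 20 and 40, pctrie_lookup_ge(ptree, 15) returns the value keyed 20, pctrie_lookup_ge(ptree, 40) returns the value keyed 40, and pctrie_lookup_ge(ptree, 41) returns NULL; pctrie_lookup_le() below is the mirror image.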
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m <= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the largest key
+ * in the current node (if the owner is smaller than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ index = node->pn_owner + PCTRIE_COUNT *
+ PCTRIE_UNITLEVEL(node->pn_clev);
+ } else {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_le: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == 0);
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is greater than 0.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ }
+ index--;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_le: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or value within the current
+ * bisection node.
+ */
+ if (slot > 0) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index |= inc - 1;
+ do {
+ index -= inc;
+ slot--;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot > 0);
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_le: child is radix node"));
+
+ /*
+ * If a value or edge smaller than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_le: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_le: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
+{
+ struct pctrie_node *node, *parent;
+ uint64_t *m;
+ int i, slot;
+
+ node = pctrie_getroot(ptree);
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ pctrie_setroot(ptree, NULL);
+ return;
+ }
+ parent = NULL;
+ for (;;) {
+ if (node == NULL)
+ panic("pctrie_remove: impossible to locate the key");
+ slot = pctrie_slot(index, node->pn_clev);
+ if (pctrie_isleaf(node->pn_child[slot])) {
+ m = pctrie_toval(node->pn_child[slot]);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ if (node->pn_count > 1)
+ break;
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ break;
+ KASSERT(i != PCTRIE_COUNT,
+ ("%s: invalid node configuration", __func__));
+ if (parent == NULL)
+ pctrie_setroot(ptree, node->pn_child[i]);
+ else {
+ slot = pctrie_slot(index, parent->pn_clev);
+ KASSERT(parent->pn_child[slot] == node,
+ ("%s: invalid child value", __func__));
+ parent->pn_child[slot] = node->pn_child[i];
+ }
+ node->pn_count--;
+ node->pn_child[i] = NULL;
+ pctrie_node_put(ptree, node, freefn);
+ break;
+ }
+ parent = node;
+ node = node->pn_child[slot];
+ }
+}
+
+/*
+ * Remove and free all the nodes from the tree.
+ * This function is recursive, but the recursion depth is tightly bounded
+ * because the maximum depth of the tree is fixed.
+ */
+void
+pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
+{
+ struct pctrie_node *root;
+
+ root = pctrie_getroot(ptree);
+ if (root == NULL)
+ return;
+ pctrie_setroot(ptree, NULL);
+ if (!pctrie_isleaf(root))
+ pctrie_reclaim_allnodes_int(ptree, root, freefn);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given node.
+ */
+DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
+{
+ struct pctrie_node *node;
+ int i;
+
+ if (!have_addr)
+ return;
+ node = (struct pctrie_node *)addr;
+ db_printf("node %p, owner %jx, children count %u, level %u:\n",
+ (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
+ node->pn_clev);
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
+ i, (void *)node->pn_child[i],
+ pctrie_isleaf(node->pn_child[i]) ?
+ pctrie_toval(node->pn_child[i]) : NULL,
+ node->pn_clev);
+}
+#endif /* DDB */
diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c
new file mode 100644
index 0000000..ac6cd71
--- /dev/null
+++ b/sys/kern/subr_power.c
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 2001 Mitsuru IWASAKI
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <sys/power.h>
+#include <sys/taskqueue.h>
+
+static u_int power_pm_type = POWER_PM_TYPE_NONE;
+static power_pm_fn_t power_pm_fn = NULL;
+static void *power_pm_arg = NULL;
+static struct task power_pm_task;
+
+static void
+power_pm_deferred_fn(void *arg, int pending)
+{
+ int state = (intptr_t)arg;
+
+ power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+}
+
+int
+power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
+{
+ int error;
+
+ if (power_pm_type == POWER_PM_TYPE_NONE ||
+ power_pm_type == pm_type) {
+ power_pm_type = pm_type;
+ power_pm_fn = pm_fn;
+ power_pm_arg = pm_arg;
+ error = 0;
+ TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL);
+ } else {
+ error = ENXIO;
+ }
+
+ return (error);
+}
+
+u_int
+power_pm_get_type(void)
+{
+
+ return (power_pm_type);
+}
+
+void
+power_pm_suspend(int state)
+{
+ if (power_pm_fn == NULL)
+ return;
+
+ if (state != POWER_SLEEP_STATE_STANDBY &&
+ state != POWER_SLEEP_STATE_SUSPEND &&
+ state != POWER_SLEEP_STATE_HIBERNATE)
+ return;
+ power_pm_task.ta_context = (void *)(intptr_t)state;
+ taskqueue_enqueue(taskqueue_thread, &power_pm_task);
+}
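A hedged sketch, not part of this patch, of how a platform power-management provider would hook in. The function, softc and device names are invented, the parameter types of the callback are only inferred from the power_pm_fn() call above, and the POWER_* constants come from sys/power.h.

    static void
    mypm_fn(u_long cmd, void *arg, int state)
    {

            if (cmd == POWER_CMD_SUSPEND)
                    ; /* put the platform into sleep state 'state' */
    }

            /* During attach: */
            if (power_pm_register(POWER_PM_TYPE_ACPI, mypm_fn, sc) != 0)
                    device_printf(dev, "another PM provider is registered\n");

            /* Later, from a suspend request: */
            power_pm_suspend(POWER_SLEEP_STATE_SUSPEND);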
+
+/*
+ * Power profile.
+ */
+
+static int power_profile_state = POWER_PROFILE_PERFORMANCE;
+
+int
+power_profile_get_state(void)
+{
+ return (power_profile_state);
+}
+
+void
+power_profile_set_state(int state)
+{
+ int changed;
+
+ if (state != power_profile_state) {
+ power_profile_state = state;
+ changed = 1;
+ if (bootverbose) {
+ printf("system power profile changed to '%s'\n",
+ (state == POWER_PROFILE_PERFORMANCE) ?
+ "performance" : "economy");
+ }
+ } else {
+ changed = 0;
+ }
+
+ if (changed)
+ EVENTHANDLER_INVOKE(power_profile_change, 0);
+}
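Another hedged sketch: a component that wants to react to the power_profile_change event invoked above could register a handler like the one below. The callback name and 'sc' are invented, and the handler's second argument simply mirrors the 0 passed by EVENTHANDLER_INVOKE().

    static void
    mydrv_profile_change(void *arg, int unused)
    {

            if (power_profile_get_state() == POWER_PROFILE_ECONOMY)
                    ; /* e.g. lower device clocks */
    }

            /* During attach: */
            EVENTHANDLER_REGISTER(power_profile_change, mydrv_profile_change,
                sc, EVENTHANDLER_PRI_ANY);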
+
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
new file mode 100644
index 0000000..042afa3
--- /dev/null
+++ b/sys/kern/subr_prf.c
@@ -0,0 +1,1140 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_printf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kdb.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/kernel.h>
+#include <sys/msgbuf.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#include <sys/sysctl.h>
+#include <sys/tty.h>
+#include <sys/syslog.h>
+#include <sys/cons.h>
+#include <sys/uio.h>
+#include <sys/ctype.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define TOCONS 0x01
+#define TOTTY 0x02
+#define TOLOG 0x04
+
+/* Max number conversion buffer length: an intmax_t in base 2, plus NUL byte. */
+#define MAXNBUF (sizeof(intmax_t) * NBBY + 1)
+
+struct putchar_arg {
+ int flags;
+ int pri;
+ struct tty *tty;
+ char *p_bufr;
+ size_t n_bufr;
+ char *p_next;
+ size_t remain;
+};
+
+struct snprintf_arg {
+ char *str;
+ size_t remain;
+};
+
+extern int log_open;
+
+static void msglogchar(int c, int pri);
+static void msglogstr(char *str, int pri, int filter_cr);
+static void putchar(int ch, void *arg);
+static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
+static void snprintf_func(int ch, void *arg);
+
+static int msgbufmapped; /* Set when safe to use msgbuf */
+int msgbuftrigger;
+
+static int log_console_output = 1;
+TUNABLE_INT("kern.log_console_output", &log_console_output);
+SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RW,
+ &log_console_output, 0, "Duplicate console output to the syslog.");
+
+/*
+ * See the comment in log_console() below for more explanation of this.
+ */
+static int log_console_add_linefeed = 0;
+TUNABLE_INT("kern.log_console_add_linefeed", &log_console_add_linefeed);
+SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RW,
+ &log_console_add_linefeed, 0, "log_console() adds extra newlines.");
+
+static int always_console_output = 0;
+TUNABLE_INT("kern.always_console_output", &always_console_output);
+SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RW,
+ &always_console_output, 0, "Always output to console despite TIOCCONS.");
+
+/*
+ * Warn that a system table is full.
+ */
+void
+tablefull(const char *tab)
+{
+
+ log(LOG_ERR, "%s: table is full\n", tab);
+}
+
+/*
+ * Uprintf prints to the controlling terminal for the current process.
+ */
+int
+uprintf(const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ struct proc *p;
+ struct thread *td;
+ int retval;
+
+ td = curthread;
+ if (TD_IS_IDLETHREAD(td))
+ return (0);
+
+ sx_slock(&proctree_lock);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if ((p->p_flag & P_CONTROLT) == 0) {
+ PROC_UNLOCK(p);
+ retval = 0;
+ goto out;
+ }
+ SESS_LOCK(p->p_session);
+ pca.tty = p->p_session->s_ttyp;
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ if (pca.tty == NULL) {
+ retval = 0;
+ goto out;
+ }
+ pca.flags = TOTTY;
+ pca.p_bufr = NULL;
+ va_start(ap, fmt);
+ tty_lock(pca.tty);
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ tty_unlock(pca.tty);
+ va_end(ap);
+out:
+ sx_sunlock(&proctree_lock);
+ return (retval);
+}
+
+/*
+ * tprintf and vtprintf print on the controlling terminal associated with the
+ * given session, possibly to the log as well.
+ */
+void
+tprintf(struct proc *p, int pri, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vtprintf(p, pri, fmt, ap);
+ va_end(ap);
+}
+
+void
+vtprintf(struct proc *p, int pri, const char *fmt, va_list ap)
+{
+ struct tty *tp = NULL;
+ int flags = 0;
+ struct putchar_arg pca;
+ struct session *sess = NULL;
+
+ sx_slock(&proctree_lock);
+ if (pri != -1)
+ flags |= TOLOG;
+ if (p != NULL) {
+ PROC_LOCK(p);
+ if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ sess = p->p_session;
+ sess_hold(sess);
+ PROC_UNLOCK(p);
+ tp = sess->s_ttyp;
+ if (tp != NULL && tty_checkoutq(tp))
+ flags |= TOTTY;
+ else
+ tp = NULL;
+ } else
+ PROC_UNLOCK(p);
+ }
+ pca.pri = pri;
+ pca.tty = tp;
+ pca.flags = flags;
+ pca.p_bufr = NULL;
+ if (pca.tty != NULL)
+ tty_lock(pca.tty);
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ if (pca.tty != NULL)
+ tty_unlock(pca.tty);
+ if (sess != NULL)
+ sess_release(sess);
+ msgbuftrigger = 1;
+ sx_sunlock(&proctree_lock);
+}
+
+/*
+ * Ttyprintf displays a message on a tty; it should be used only by
+ * the tty driver, or anything that knows the underlying tty will not
+ * be revoke(2)'d away. Other callers should use tprintf.
+ */
+int
+ttyprintf(struct tty *tp, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ int retval;
+
+ va_start(ap, fmt);
+ pca.tty = tp;
+ pca.flags = TOTTY;
+ pca.p_bufr = NULL;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Log writes to the log buffer and guarantees not to sleep (so it can be
+ * called by interrupt routines). If there is no process reading the
+ * log yet, it writes to the console also.
+ */
+void
+log(int level, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
+
+ pca.tty = NULL;
+ pca.pri = level;
+ pca.flags = log_open ? TOLOG : TOCONS;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ pca.p_bufr = NULL;
+#endif
+
+ va_start(ap, fmt);
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console/log output: */
+ if (*pca.p_bufr != '\0') {
+ if (pca.flags & TOLOG)
+ msglogstr(pca.p_bufr, level, /*filter_cr*/1);
+
+ if (pca.flags & TOCONS)
+ cnputs(pca.p_bufr);
+ }
+#endif
+ msgbuftrigger = 1;
+}
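For illustration only (the device and counter below are invented), a driver records a non-fatal event with a syslog priority like this; the string lands in the message buffer and, while nothing is reading the log, on the console as well.

    log(LOG_WARNING, "%s: watchdog timeout, resetting (%d so far)\n",
        device_get_nameunit(dev), sc->reset_count);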
+
+#define CONSCHUNK 128
+
+void
+log_console(struct uio *uio)
+{
+ int c, error, nl;
+ char *consbuffer;
+ int pri;
+
+ if (!log_console_output)
+ return;
+
+ pri = LOG_INFO | LOG_CONSOLE;
+ uio = cloneuio(uio);
+ consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK);
+
+ nl = 0;
+ while (uio->uio_resid > 0) {
+ c = imin(uio->uio_resid, CONSCHUNK - 1);
+ error = uiomove(consbuffer, c, uio);
+ if (error != 0)
+ break;
+ /* Make sure we're NUL-terminated */
+ consbuffer[c] = '\0';
+ if (consbuffer[c - 1] == '\n')
+ nl = 1;
+ else
+ nl = 0;
+ msglogstr(consbuffer, pri, /*filter_cr*/ 1);
+ }
+ /*
+ * The previous behavior in log_console() is preserved when
+ * log_console_add_linefeed is non-zero. For that behavior, if an
+ * individual console write came in that was not terminated with a
+ * line feed, it would add a line feed.
+ *
+ * This results in different data in the message buffer than
+ * appears on the system console (which doesn't add extra line feed
+ * characters).
+ *
+ * A number of programs and rc scripts write a line feed, or a period
+ * and a line feed when they have completed their operation. On
+ * the console, this looks seamless, but when displayed with
+ * 'dmesg -a', you wind up with output that looks like this:
+ *
+ * Updating motd:
+ * .
+ *
+ * On the console, it looks like this:
+ * Updating motd:.
+ *
+ * We could add logic to detect that situation, or just not insert
+ * the extra newlines. Set the kern.log_console_add_linefeed
+ * sysctl/tunable variable to get the old behavior.
+ */
+ if (!nl && log_console_add_linefeed) {
+ consbuffer[0] = '\n';
+ consbuffer[1] = '\0';
+ msglogstr(consbuffer, pri, /*filter_cr*/ 1);
+ }
+ msgbuftrigger = 1;
+ free(uio, M_IOV);
+ free(consbuffer, M_TEMP);
+ return;
+}
+
+int
+printf(const char *fmt, ...)
+{
+ va_list ap;
+ int retval;
+
+ va_start(ap, fmt);
+ retval = vprintf(fmt, ap);
+ va_end(ap);
+
+ return (retval);
+}
+
+int
+vprintf(const char *fmt, va_list ap)
+{
+ struct putchar_arg pca;
+ int retval;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
+
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ /* Don't buffer console output. */
+ pca.p_bufr = NULL;
+#endif
+
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console/log output: */
+ if (*pca.p_bufr != '\0') {
+ cnputs(pca.p_bufr);
+ msglogstr(pca.p_bufr, pca.pri, /*filter_cr*/ 1);
+ }
+#endif
+
+ if (!panicstr)
+ msgbuftrigger = 1;
+
+ return (retval);
+}
+
+static void
+putbuf(int c, struct putchar_arg *ap)
+{
+ /* Check if no console output buffer was provided. */
+ if (ap->p_bufr == NULL) {
+ /* Output direct to the console. */
+ if (ap->flags & TOCONS)
+ cnputc(c);
+
+ if (ap->flags & TOLOG)
+ msglogchar(c, ap->pri);
+ } else {
+ /* Buffer the character: */
+ *ap->p_next++ = c;
+ ap->remain--;
+
+ /* Always leave the buffer zero terminated. */
+ *ap->p_next = '\0';
+
+ /* Check if the buffer needs to be flushed. */
+ if (ap->remain == 2 || c == '\n') {
+
+ if (ap->flags & TOLOG)
+ msglogstr(ap->p_bufr, ap->pri, /*filter_cr*/1);
+
+ if (ap->flags & TOCONS) {
+ if ((panicstr == NULL) && (constty != NULL))
+ msgbuf_addstr(&consmsgbuf, -1,
+ ap->p_bufr, /*filter_cr*/ 0);
+
+ if ((constty == NULL) || (always_console_output))
+ cnputs(ap->p_bufr);
+ }
+
+ ap->p_next = ap->p_bufr;
+ ap->remain = ap->n_bufr;
+ *ap->p_next = '\0';
+ }
+
+ /*
+ * Since we fill the buffer up one character at a time,
+ * this should not happen. We should always catch it when
+ * ap->remain == 2 (if not sooner due to a newline), flush
+ * the buffer and move on. One way this could happen is
+ * if someone sets PRINTF_BUFR_SIZE to 1 or something
+ * similarly silly.
+ */
+ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd",
+ ap->remain));
+ }
+}
+
+/*
+ * Print a character on the console or on a user's terminal. If the
+ * destination is the console, the most recent characters are also saved
+ * in msgbuf for later inspection.
+ */
+static void
+putchar(int c, void *arg)
+{
+ struct putchar_arg *ap = (struct putchar_arg*) arg;
+ struct tty *tp = ap->tty;
+ int flags = ap->flags;
+
+ /* Don't use the tty code after a panic or while in ddb. */
+ if (kdb_active) {
+ if (c != '\0')
+ cnputc(c);
+ return;
+ }
+
+ if ((flags & TOTTY) && tp != NULL && panicstr == NULL)
+ tty_putchar(tp, c);
+
+ if ((flags & (TOCONS | TOLOG)) && c != '\0')
+ putbuf(c, ap);
+}
+
+/*
+ * Scaled down version of sprintf(3).
+ */
+int
+sprintf(char *buf, const char *cfmt, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, cfmt);
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Scaled down version of vsprintf(3).
+ */
+int
+vsprintf(char *buf, const char *cfmt, va_list ap)
+{
+ int retval;
+
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ return (retval);
+}
+
+/*
+ * Scaled down version of snprintf(3).
+ */
+int
+snprintf(char *str, size_t size, const char *format, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, format);
+ retval = vsnprintf(str, size, format, ap);
+ va_end(ap);
+ return(retval);
+}
+
+/*
+ * Scaled down version of vsnprintf(3).
+ */
+int
+vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, 10, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return (retval);
+}
+
+/*
+ * Kernel version of vsnprintf(3) that takes a radix argument.
+ */
+int
+vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, radix, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return (retval);
+}
+
+static void
+snprintf_func(int ch, void *arg)
+{
+ struct snprintf_arg *const info = arg;
+
+ if (info->remain >= 2) {
+ *info->str++ = ch;
+ info->remain--;
+ }
+}
+
+/*
+ * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
+ * order; return an optional length and a pointer to the last character
+ * written in the buffer (i.e., the first character of the string).
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
+ */
+static char *
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
+{
+ char *p, c;
+
+ p = nbuf;
+ *p = '\0';
+ do {
+ c = hex2ascii(num % base);
+ *++p = upper ? toupper(c) : c;
+ } while (num /= base);
+ if (lenp)
+ *lenp = p - nbuf;
+ return (p);
+}
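A hedged trace of this helper: ksprintn(nbuf, 255, 16, &len, 0) stores '\0', 'f', 'f' into nbuf, sets len to 2, and returns a pointer to the final 'f', so callers emit the digits by walking backwards, exactly as kvprintf() does below.

    char nbuf[MAXNBUF];
    char *q;
    int len;

    for (q = ksprintn(nbuf, 255, 16, &len, 0); *q != '\0'; q--)
            cnputc(*q);             /* prints "ff"; len == 2 */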
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ * printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of that bit. Thus:
+ *
+ * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ * reg=3<BITTWO,BITONE>
+ *
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ * ("%*D", len, ptr, " " -> XX XX XX XX ...
+ */
+int
+kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
+{
+#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
+ char nbuf[MAXNBUF];
+ char *d;
+ const char *p, *percent, *q;
+ u_char *up;
+ int ch, n;
+ uintmax_t num;
+ int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+ int cflag, hflag, jflag, tflag, zflag;
+ int dwidth, upper;
+ char padc;
+ int stop = 0, retval = 0;
+
+ num = 0;
+ if (!func)
+ d = (char *) arg;
+ else
+ d = NULL;
+
+ if (fmt == NULL)
+ fmt = "(fmt null)\n";
+
+ if (radix < 2 || radix > 36)
+ radix = 10;
+
+ for (;;) {
+ padc = ' ';
+ width = 0;
+ while ((ch = (u_char)*fmt++) != '%' || stop) {
+ if (ch == '\0')
+ return (retval);
+ PCHAR(ch);
+ }
+ percent = fmt - 1;
+ qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
+ sign = 0; dot = 0; dwidth = 0; upper = 0;
+ cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
+reswitch: switch (ch = (u_char)*fmt++) {
+ case '.':
+ dot = 1;
+ goto reswitch;
+ case '#':
+ sharpflag = 1;
+ goto reswitch;
+ case '+':
+ sign = 1;
+ goto reswitch;
+ case '-':
+ ladjust = 1;
+ goto reswitch;
+ case '%':
+ PCHAR(ch);
+ break;
+ case '*':
+ if (!dot) {
+ width = va_arg(ap, int);
+ if (width < 0) {
+ ladjust = !ladjust;
+ width = -width;
+ }
+ } else {
+ dwidth = va_arg(ap, int);
+ }
+ goto reswitch;
+ case '0':
+ if (!dot) {
+ padc = '0';
+ goto reswitch;
+ }
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ for (n = 0;; ++fmt) {
+ n = n * 10 + ch - '0';
+ ch = *fmt;
+ if (ch < '0' || ch > '9')
+ break;
+ }
+ if (dot)
+ dwidth = n;
+ else
+ width = n;
+ goto reswitch;
+ case 'b':
+ num = (u_int)va_arg(ap, int);
+ p = va_arg(ap, char *);
+ for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
+ PCHAR(*q--);
+
+ if (num == 0)
+ break;
+
+ for (tmp = 0; *p;) {
+ n = *p++;
+ if (num & (1 << (n - 1))) {
+ PCHAR(tmp ? ',' : '<');
+ for (; (n = *p) > ' '; ++p)
+ PCHAR(n);
+ tmp = 1;
+ } else
+ for (; *p > ' '; ++p)
+ continue;
+ }
+ if (tmp)
+ PCHAR('>');
+ break;
+ case 'c':
+ PCHAR(va_arg(ap, int));
+ break;
+ case 'D':
+ up = va_arg(ap, u_char *);
+ p = va_arg(ap, char *);
+ if (!width)
+ width = 16;
+ while(width--) {
+ PCHAR(hex2ascii(*up >> 4));
+ PCHAR(hex2ascii(*up & 0x0f));
+ up++;
+ if (width)
+ for (q = p; *q; q++)
+ PCHAR(*q);
+ }
+ break;
+ case 'd':
+ case 'i':
+ base = 10;
+ sign = 1;
+ goto handle_sign;
+ case 'h':
+ if (hflag) {
+ hflag = 0;
+ cflag = 1;
+ } else
+ hflag = 1;
+ goto reswitch;
+ case 'j':
+ jflag = 1;
+ goto reswitch;
+ case 'l':
+ if (lflag) {
+ lflag = 0;
+ qflag = 1;
+ } else
+ lflag = 1;
+ goto reswitch;
+ case 'n':
+ if (jflag)
+ *(va_arg(ap, intmax_t *)) = retval;
+ else if (qflag)
+ *(va_arg(ap, quad_t *)) = retval;
+ else if (lflag)
+ *(va_arg(ap, long *)) = retval;
+ else if (zflag)
+ *(va_arg(ap, size_t *)) = retval;
+ else if (hflag)
+ *(va_arg(ap, short *)) = retval;
+ else if (cflag)
+ *(va_arg(ap, char *)) = retval;
+ else
+ *(va_arg(ap, int *)) = retval;
+ break;
+ case 'o':
+ base = 8;
+ goto handle_nosign;
+ case 'p':
+ base = 16;
+ sharpflag = (width == 0);
+ sign = 0;
+ num = (uintptr_t)va_arg(ap, void *);
+ goto number;
+ case 'q':
+ qflag = 1;
+ goto reswitch;
+ case 'r':
+ base = radix;
+ if (sign)
+ goto handle_sign;
+ goto handle_nosign;
+ case 's':
+ p = va_arg(ap, char *);
+ if (p == NULL)
+ p = "(null)";
+ if (!dot)
+ n = strlen (p);
+ else
+ for (n = 0; n < dwidth && p[n]; n++)
+ continue;
+
+ width -= n;
+
+ if (!ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ while (n--)
+ PCHAR(*p++);
+ if (ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ break;
+ case 't':
+ tflag = 1;
+ goto reswitch;
+ case 'u':
+ base = 10;
+ goto handle_nosign;
+ case 'X':
+ upper = 1;
+ case 'x':
+ base = 16;
+ goto handle_nosign;
+ case 'y':
+ base = 16;
+ sign = 1;
+ goto handle_sign;
+ case 'z':
+ zflag = 1;
+ goto reswitch;
+handle_nosign:
+ sign = 0;
+ if (jflag)
+ num = va_arg(ap, uintmax_t);
+ else if (qflag)
+ num = va_arg(ap, u_quad_t);
+ else if (tflag)
+ num = va_arg(ap, ptrdiff_t);
+ else if (lflag)
+ num = va_arg(ap, u_long);
+ else if (zflag)
+ num = va_arg(ap, size_t);
+ else if (hflag)
+ num = (u_short)va_arg(ap, int);
+ else if (cflag)
+ num = (u_char)va_arg(ap, int);
+ else
+ num = va_arg(ap, u_int);
+ goto number;
+handle_sign:
+ if (jflag)
+ num = va_arg(ap, intmax_t);
+ else if (qflag)
+ num = va_arg(ap, quad_t);
+ else if (tflag)
+ num = va_arg(ap, ptrdiff_t);
+ else if (lflag)
+ num = va_arg(ap, long);
+ else if (zflag)
+ num = va_arg(ap, ssize_t);
+ else if (hflag)
+ num = (short)va_arg(ap, int);
+ else if (cflag)
+ num = (char)va_arg(ap, int);
+ else
+ num = va_arg(ap, int);
+number:
+ if (sign && (intmax_t)num < 0) {
+ neg = 1;
+ num = -(intmax_t)num;
+ }
+ p = ksprintn(nbuf, num, base, &n, upper);
+ tmp = 0;
+ if (sharpflag && num != 0) {
+ if (base == 8)
+ tmp++;
+ else if (base == 16)
+ tmp += 2;
+ }
+ if (neg)
+ tmp++;
+
+ if (!ladjust && padc == '0')
+ dwidth = width - tmp;
+ width -= tmp + imax(dwidth, n);
+ dwidth -= n;
+ if (!ladjust)
+ while (width-- > 0)
+ PCHAR(' ');
+ if (neg)
+ PCHAR('-');
+ if (sharpflag && num != 0) {
+ if (base == 8) {
+ PCHAR('0');
+ } else if (base == 16) {
+ PCHAR('0');
+ PCHAR('x');
+ }
+ }
+ while (dwidth-- > 0)
+ PCHAR('0');
+
+ while (*p)
+ PCHAR(*p--);
+
+ if (ladjust)
+ while (width-- > 0)
+ PCHAR(' ');
+
+ break;
+ default:
+ while (percent < fmt)
+ PCHAR(*percent++);
+ /*
+ * Since we ignore a formatting argument it is no
+ * longer safe to obey the remaining formatting
+ * arguments as the arguments will no longer match
+ * the format specs.
+ */
+ stop = 1;
+ break;
+ }
+ }
+#undef PCHAR
+}
+
+/*
+ * Put character in log buffer with a particular priority.
+ */
+static void
+msglogchar(int c, int pri)
+{
+ static int lastpri = -1;
+ static int dangling;
+ char nbuf[MAXNBUF];
+ char *p;
+
+ if (!msgbufmapped)
+ return;
+ if (c == '\0' || c == '\r')
+ return;
+ if (pri != -1 && pri != lastpri) {
+ if (dangling) {
+ msgbuf_addchar(msgbufp, '\n');
+ dangling = 0;
+ }
+ msgbuf_addchar(msgbufp, '<');
+ for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;)
+ msgbuf_addchar(msgbufp, *p--);
+ msgbuf_addchar(msgbufp, '>');
+ lastpri = pri;
+ }
+ msgbuf_addchar(msgbufp, c);
+ if (c == '\n') {
+ dangling = 0;
+ lastpri = -1;
+ } else {
+ dangling = 1;
+ }
+}
+
+static void
+msglogstr(char *str, int pri, int filter_cr)
+{
+ if (!msgbufmapped)
+ return;
+
+ msgbuf_addstr(msgbufp, pri, str, filter_cr);
+}
+
+void
+msgbufinit(void *ptr, int size)
+{
+ char *cp;
+ static struct msgbuf *oldp = NULL;
+
+ size -= sizeof(*msgbufp);
+ cp = (char *)ptr;
+ msgbufp = (struct msgbuf *)(cp + size);
+ msgbuf_reinit(msgbufp, cp, size);
+ if (msgbufmapped && oldp != msgbufp)
+ msgbuf_copy(oldp, msgbufp);
+ msgbufmapped = 1;
+ oldp = msgbufp;
+}
+
+static int unprivileged_read_msgbuf = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
+ CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
+ "Unprivileged processes may read the kernel message buffer");
+
+/* Sysctls for accessing/clearing the msgbuf */
+static int
+sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS)
+{
+ char buf[128];
+ u_int seq;
+ int error, len;
+
+ if (!unprivileged_read_msgbuf) {
+ error = priv_check(req->td, PRIV_MSGBUF);
+ if (error)
+ return (error);
+ }
+
+ /* Read the whole buffer, one chunk at a time. */
+ mtx_lock(&msgbuf_lock);
+ msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
+ for (;;) {
+ len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
+ mtx_unlock(&msgbuf_lock);
+ if (len == 0)
+ return (0);
+
+ error = sysctl_handle_opaque(oidp, buf, len, req);
+ if (error)
+ return (error);
+
+ mtx_lock(&msgbuf_lock);
+ }
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer");
+
+static int msgbuf_clearflag;
+
+static int
+sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error && req->newptr) {
+ mtx_lock(&msgbuf_lock);
+ msgbuf_clear(msgbufp);
+ mtx_unlock(&msgbuf_lock);
+ msgbuf_clearflag = 0;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE,
+ &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I",
+ "Clear kernel message buffer");
+
+#ifdef DDB
+
+DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
+{
+ int i, j;
+
+ if (!msgbufmapped) {
+ db_printf("msgbuf not mapped yet\n");
+ return;
+ }
+ db_printf("msgbufp = %p\n", msgbufp);
+ db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
+ msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
+ msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
+ for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) {
+ j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq);
+ db_printf("%c", msgbufp->msg_ptr[j]);
+ }
+ db_printf("\n");
+}
+
+#endif /* DDB */
+
+void
+hexdump(const void *ptr, int length, const char *hdr, int flags)
+{
+ int i, j, k;
+ int cols;
+ const unsigned char *cp;
+ char delim;
+
+ if ((flags & HD_DELIM_MASK) != 0)
+ delim = (flags & HD_DELIM_MASK) >> 8;
+ else
+ delim = ' ';
+
+ if ((flags & HD_COLUMN_MASK) != 0)
+ cols = flags & HD_COLUMN_MASK;
+ else
+ cols = 16;
+
+ cp = ptr;
+ for (i = 0; i < length; i += cols) {
+ if (hdr != NULL)
+ printf("%s", hdr);
+
+ if ((flags & HD_OMIT_COUNT) == 0)
+ printf("%04x ", i);
+
+ if ((flags & HD_OMIT_HEX) == 0) {
+ for (j = 0; j < cols; j++) {
+ k = i + j;
+ if (k < length)
+ printf("%c%02x", delim, cp[k]);
+ else
+ printf(" ");
+ }
+ }
+
+ if ((flags & HD_OMIT_CHARS) == 0) {
+ printf(" |");
+ for (j = 0; j < cols; j++) {
+ k = i + j;
+ if (k >= length)
+ printf(" ");
+ else if (cp[k] >= ' ' && cp[k] <= '~')
+ printf("%c", cp[k]);
+ else
+ printf(".");
+ }
+ printf("|");
+ }
+ printf("\n");
+ }
+}
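A hedged usage note: each output row is the byte offset, the hex bytes (cols per row, 16 by default, delimiter taken from HD_DELIM_MASK), and a |printable characters| column; the HD_OMIT_* flags drop individual columns. For example (buffer contents invented):

    static const char sample[] = "hello, world";

    hexdump(sample, sizeof(sample), NULL, 0);
    /* One row: offset 0000, thirteen hex bytes, then |hello, world.| */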
+
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
new file mode 100644
index 0000000..c5b6b08
--- /dev/null
+++ b/sys/kern/subr_prof.c
@@ -0,0 +1,589 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+#ifdef GPROF
+#include <sys/malloc.h>
+#include <sys/gmon.h>
+#undef MCOUNT
+
+static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
+
+static void kmstartup(void *);
+SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL);
+
+struct gmonparam _gmonparam = { GMON_PROF_OFF };
+
+#ifdef GUPROF
+void
+nullfunc_loop_profiled()
+{
+ int i;
+
+ for (i = 0; i < CALIB_SCALE; i++)
+ nullfunc_profiled();
+}
+
+#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
+
+void
+nullfunc_profiled()
+{
+}
+#endif /* GUPROF */
+
+/*
+ * Update the histograms to support extending the text region arbitrarily.
+ * This is done slightly naively (no sparse regions), so it wastes a small
+ * amount of memory, but overall it works well enough to allow profiling
+ * of KLDs.
+ */
+void
+kmupetext(uintfptr_t nhighpc)
+{
+ struct gmonparam np; /* slightly large */
+ struct gmonparam *p = &_gmonparam;
+ char *cp;
+
+ GIANT_REQUIRED;
+ bcopy(p, &np, sizeof(*p));
+ np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER));
+ if (np.highpc <= p->highpc)
+ return;
+ np.textsize = np.highpc - p->lowpc;
+ np.kcountsize = np.textsize / HISTFRACTION;
+ np.hashfraction = HASHFRACTION;
+ np.fromssize = np.textsize / HASHFRACTION;
+ np.tolimit = np.textsize * ARCDENSITY / 100;
+ if (np.tolimit < MINARCS)
+ np.tolimit = MINARCS;
+ else if (np.tolimit > MAXARCS)
+ np.tolimit = MAXARCS;
+ np.tossize = np.tolimit * sizeof(struct tostruct);
+ cp = malloc(np.kcountsize + np.fromssize + np.tossize,
+ M_GPROF, M_WAITOK);
+ /*
+ * Check for something else extending highpc while we slept.
+ */
+ if (np.highpc <= p->highpc) {
+ free(cp, M_GPROF);
+ return;
+ }
+ np.tos = (struct tostruct *)cp;
+ cp += np.tossize;
+ np.kcount = (HISTCOUNTER *)cp;
+ cp += np.kcountsize;
+ np.froms = (u_short *)cp;
+#ifdef GUPROF
+ /* Reinitialize pointers to overhead counters. */
+ np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime));
+ np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount));
+ np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount));
+#endif
+ critical_enter();
+ bcopy(p->tos, np.tos, p->tossize);
+ bzero((char *)np.tos + p->tossize, np.tossize - p->tossize);
+ bcopy(p->kcount, np.kcount, p->kcountsize);
+ bzero((char *)np.kcount + p->kcountsize, np.kcountsize -
+ p->kcountsize);
+ bcopy(p->froms, np.froms, p->fromssize);
+ bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize);
+ cp = (char *)p->tos;
+ bcopy(&np, p, sizeof(*p));
+ critical_exit();
+ free(cp, M_GPROF);
+}
+
+static void
+kmstartup(dummy)
+ void *dummy;
+{
+ char *cp;
+ struct gmonparam *p = &_gmonparam;
+#ifdef GUPROF
+ int cputime_overhead;
+ int empty_loop_time;
+ int i;
+ int mcount_overhead;
+ int mexitcount_overhead;
+ int nullfunc_loop_overhead;
+ int nullfunc_loop_profiled_time;
+ uintfptr_t tmp_addr;
+#endif
+
+ /*
+ * Round lowpc and highpc to multiples of the density we're using
+ * so the rest of the scaling (here and in gprof) stays in ints.
+ */
+ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->textsize = p->highpc - p->lowpc;
+ printf("Profiling kernel, textsize=%lu [%jx..%jx]\n",
+ p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc);
+ p->kcountsize = p->textsize / HISTFRACTION;
+ p->hashfraction = HASHFRACTION;
+ p->fromssize = p->textsize / HASHFRACTION;
+ p->tolimit = p->textsize * ARCDENSITY / 100;
+ if (p->tolimit < MINARCS)
+ p->tolimit = MINARCS;
+ else if (p->tolimit > MAXARCS)
+ p->tolimit = MAXARCS;
+ p->tossize = p->tolimit * sizeof(struct tostruct);
+ cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
+ M_GPROF, M_WAITOK | M_ZERO);
+ p->tos = (struct tostruct *)cp;
+ cp += p->tossize;
+ p->kcount = (HISTCOUNTER *)cp;
+ cp += p->kcountsize;
+ p->froms = (u_short *)cp;
+ p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY;
+
+#ifdef GUPROF
+ /* Signed counters. */
+ p->histcounter_type = -p->histcounter_type;
+
+ /* Initialize pointers to overhead counters. */
+ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
+ p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
+ p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
+
+ /*
+ * Disable interrupts to avoid interference while we calibrate
+ * things.
+ */
+ critical_enter();
+
+ /*
+ * Determine overheads.
+ * XXX this needs to be repeated for each useful timer/counter.
+ */
+ cputime_overhead = 0;
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ cputime_overhead += cputime();
+
+ empty_loop();
+ startguprof(p);
+ empty_loop();
+ empty_loop_time = cputime();
+
+ nullfunc_loop_profiled();
+
+ /*
+ * Start profiling. There won't be any normal function calls since
+ * interrupts are disabled, but we will call the profiling routines
+ * directly to determine their overheads.
+ */
+ p->state = GMON_PROF_HIRES;
+
+ startguprof(p);
+ nullfunc_loop_profiled();
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ MCOUNT_OVERHEAD(sys_profil);
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, sys_profil));
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ MEXITCOUNT_OVERHEAD();
+ MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr);
+ mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
+
+ p->state = GMON_PROF_OFF;
+ stopguprof(p);
+
+ critical_exit();
+
+ nullfunc_loop_profiled_time = 0;
+ for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
+ tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
+ tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
+ nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
+#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
+#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
+ printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
+ CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
+ CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
+ CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
+ cputime_overhead -= empty_loop_time;
+ mcount_overhead -= empty_loop_time;
+ mexitcount_overhead -= empty_loop_time;
+
+ /*-
+ * Profiling overheads are determined by the times between the
+ * following events:
+ * MC1: mcount() is called
+ * MC2: cputime() (called from mcount()) latches the timer
+ * MC3: mcount() completes
+ * ME1: mexitcount() is called
+ * ME2: cputime() (called from mexitcount()) latches the timer
+ * ME3: mexitcount() completes.
+ * The times between the events vary slightly depending on instruction
+ * combination and cache misses, etc. Attempt to determine the
+ * minimum times. These can be subtracted from the profiling times
+ * without much risk of reducing the profiling times below what they
+ * would be when profiling is not configured. Abbreviate:
+ * ab = minimum time between MC1 and MC3
+ * a = minimum time between MC1 and MC2
+ * b = minimum time between MC2 and MC3
+ * cd = minimum time between ME1 and ME3
+ * c = minimum time between ME1 and ME2
+ * d = minimum time between ME2 and ME3.
+ * These satisfy the relations:
+ * ab <= mcount_overhead (just measured)
+ * a + b <= ab
+ * cd <= mexitcount_overhead (just measured)
+ * c + d <= cd
+ * a + d <= nullfunc_loop_profiled_time (just measured)
+ * a >= 0, b >= 0, c >= 0, d >= 0.
+ * Assume that ab and cd are equal to the minimums.
+ */
+ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
+ p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
+ p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
+ - cputime_overhead);
+ nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
+ p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
+ - nullfunc_loop_overhead)
+ / 4);
+ p->mexitcount_pre_overhead = p->mexitcount_overhead
+ + p->cputime_overhead
+ - p->mexitcount_post_overhead;
+ p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
+ - p->mexitcount_post_overhead;
+ p->mcount_post_overhead = p->mcount_overhead
+ + p->cputime_overhead
+ - p->mcount_pre_overhead;
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mcount_overhead, p->profrate),
+ c2n(p->mcount_pre_overhead, p->profrate),
+ c2n(p->mcount_post_overhead, p->profrate),
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mexitcount_overhead, p->profrate),
+ c2n(p->mexitcount_pre_overhead, p->profrate),
+ c2n(p->mexitcount_post_overhead, p->profrate));
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
+ p->cputime_overhead, p->mcount_overhead,
+ p->mcount_pre_overhead, p->mcount_post_overhead,
+ p->cputime_overhead, p->mexitcount_overhead,
+ p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
+#endif /* GUPROF */
+}
+
+/*
+ * Return kernel profiling information.
+ */
+static int
+sysctl_kern_prof(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ struct gmonparam *gp = &_gmonparam;
+ int error;
+ int state;
+
+ /* all sysctl names at this level are terminal */
+ if (namelen != 1)
+ return (ENOTDIR); /* overloaded */
+
+ switch (name[0]) {
+ case GPROF_STATE:
+ state = gp->state;
+ error = sysctl_handle_int(oidp, &state, 0, req);
+ if (error)
+ return (error);
+ if (!req->newptr)
+ return (0);
+ if (state == GMON_PROF_OFF) {
+ gp->state = state;
+ PROC_LOCK(&proc0);
+ stopprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ stopguprof(gp);
+ } else if (state == GMON_PROF_ON) {
+ gp->state = GMON_PROF_OFF;
+ stopguprof(gp);
+ gp->profrate = profhz;
+ PROC_LOCK(&proc0);
+ startprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ gp->state = state;
+#ifdef GUPROF
+ } else if (state == GMON_PROF_HIRES) {
+ gp->state = GMON_PROF_OFF;
+ PROC_LOCK(&proc0);
+ stopprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ startguprof(gp);
+ gp->state = state;
+#endif
+ } else if (state != gp->state)
+ return (EINVAL);
+ return (0);
+ case GPROF_COUNT:
+ return (sysctl_handle_opaque(oidp,
+ gp->kcount, gp->kcountsize, req));
+ case GPROF_FROMS:
+ return (sysctl_handle_opaque(oidp,
+ gp->froms, gp->fromssize, req));
+ case GPROF_TOS:
+ return (sysctl_handle_opaque(oidp,
+ gp->tos, gp->tossize, req));
+ case GPROF_GMONPARAM:
+ return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
+ default:
+ return (EOPNOTSUPP);
+ }
+ /* NOTREACHED */
+}
+
+static SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
+#endif /* GPROF */
+
+/*
+ * Profiling system call.
+ *
+ * The scale factor is a fixed point number with 16 bits of fraction, so that
+ * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct profil_args {
+ caddr_t samples;
+ size_t size;
+ size_t offset;
+ u_int scale;
+};
+#endif
+/* ARGSUSED */
+int
+sys_profil(struct thread *td, struct profil_args *uap)
+{
+ struct uprof *upp;
+ struct proc *p;
+
+ if (uap->scale > (1 << 16))
+ return (EINVAL);
+
+ p = td->td_proc;
+ if (uap->scale == 0) {
+ PROC_LOCK(p);
+ stopprofclock(p);
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_LOCK(p);
+ upp = &td->td_proc->p_stats->p_prof;
+ PROC_SLOCK(p);
+ upp->pr_off = uap->offset;
+ upp->pr_scale = uap->scale;
+ upp->pr_base = uap->samples;
+ upp->pr_size = uap->size;
+ PROC_SUNLOCK(p);
+ startprofclock(p);
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+/*
+ * Scale is a fixed-point number with the binary point 16 bits
+ * into the value, and is <= 1.0. pc is at most 32 bits, so the
+ * intermediate result is at most 48 bits.
+ */
+#define PC_TO_INDEX(pc, prof) \
+ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
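A hedged worked example of the macro: with pr_scale 0x10000 (1.0) the index is simply (pc - pr_off) rounded down to an even value, so the sample buffer covers the profiled region byte for byte; halving the scale halves the buffer. Concretely:

    u_quad_t off = 0x104;                   /* pc - pr_off */
    u_quad_t scale = 0x8000;                /* 0.5 in 16.16 fixed point */
    int idx = (int)((off * scale) >> 16) & ~1;      /* == 0x82 */
    /* addupc_*() below bumps the u_short counter at pr_base + idx. */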
+
+/*
+ * Collect user-level profiling statistics; called on a profiling tick,
+ * when a process is running in user-mode. This routine may be called
+ * from an interrupt context. We try to update the user profiling buffers
+ * cheaply with fuswintr() and suswintr(). If that fails, we revert to
+ * an AST that will vector us to trap() with a context in which copyin
+ * and copyout will work. Trap will then call addupc_task().
+ *
+ * Note that we may (rarely) not get around to the AST soon enough, and
+ * lose profile ticks when the next tick overwrites this one, but in this
+ * case the system is overloaded and the profile is probably already
+ * inaccurate.
+ */
+void
+addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
+{
+ struct uprof *prof;
+ caddr_t addr;
+ u_int i;
+ int v;
+
+ if (ticks == 0)
+ return;
+ prof = &td->td_proc->p_stats->p_prof;
+ PROC_SLOCK(td->td_proc);
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+ PROC_SUNLOCK(td->td_proc);
+ return; /* out of range; ignore */
+ }
+
+ addr = prof->pr_base + i;
+ PROC_SUNLOCK(td->td_proc);
+ if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
+ td->td_profil_addr = pc;
+ td->td_profil_ticks = ticks;
+ td->td_pflags |= TDP_OWEUPC;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Much like before, but we can afford to take faults here. If the
+ * update fails, we simply turn off profiling.
+ */
+void
+addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
+{
+ struct proc *p = td->td_proc;
+ struct uprof *prof;
+ caddr_t addr;
+ u_int i;
+ u_short v;
+ int stop = 0;
+
+ if (ticks == 0)
+ return;
+
+ PROC_LOCK(p);
+ if (!(p->p_flag & P_PROFIL)) {
+ PROC_UNLOCK(p);
+ return;
+ }
+ p->p_profthreads++;
+ prof = &p->p_stats->p_prof;
+ PROC_SLOCK(p);
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+
+ addr = prof->pr_base + i;
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ if (copyin(addr, &v, sizeof(v)) == 0) {
+ v += ticks;
+ if (copyout(&v, addr, sizeof(v)) == 0) {
+ PROC_LOCK(p);
+ goto out;
+ }
+ }
+ stop = 1;
+ PROC_LOCK(p);
+
+out:
+ if (--p->p_profthreads == 0) {
+ if (p->p_flag & P_STOPPROF) {
+ wakeup(&p->p_profthreads);
+ stop = 0;
+ }
+ }
+ if (stop)
+ stopprofclock(p);
+ PROC_UNLOCK(p);
+}
+
+#if (defined(__amd64__) || defined(__i386__)) && \
+ defined(__GNUCLIKE_CTOR_SECTION_HANDLING)
+/*
+ * Support for "--test-coverage --profile-arcs" in GCC.
+ *
+ * We need to call all the functions in the .ctor section, in order
+ * to get all the counter-arrays strung into a list.
+ *
+ * XXX: the .ctors call __bb_init_func which is located over in
+ * XXX: i386/i386/support.s for historical reasons. There is probably
+ * XXX: no reason for that to be assembler anymore, but doing it right
+ * XXX: in MI C code requires one to reverse-engineer the type-selection
+ * XXX: inside GCC. Have fun.
+ *
+ * XXX: Worrisome perspective: Calling the .ctors may make C++ in the
+ * XXX: kernel feasible. Don't.
+ */
+typedef void (*ctor_t)(void);
+extern ctor_t _start_ctors, _stop_ctors;
+
+static void
+tcov_init(void *foo __unused)
+{
+ ctor_t *p, q;
+
+ for (p = &_start_ctors; p < &_stop_ctors; p++) {
+ q = *p;
+ q();
+ }
+}
+
+SYSINIT(tcov_init, SI_SUB_KPROF, SI_ORDER_SECOND, tcov_init, NULL);
+
+/*
+ * GCC contains magic to recognize calls to, for instance, execve() and
+ * inserts calls to this function to preserve the profile counters.
+ * XXX: Put zinging punchline here.
+ */
+void __bb_fork_func(void);
+void
+__bb_fork_func(void)
+{
+}
+
+#endif
+
diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c
new file mode 100644
index 0000000..e43dfcf
--- /dev/null
+++ b/sys/kern/subr_rman.c
@@ -0,0 +1,1160 @@
+/*-
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The kernel resource manager. This code is responsible for keeping track
+ * of hardware resources which are apportioned out to various drivers.
+ * It does not actually assign those resources, and it is not expected
+ * that end-device drivers will call into this code directly. Rather,
+ * the code which implements the buses that those devices are attached to,
+ * and the code which manages CPU resources, will call this code, and the
+ * end-device drivers will make upcalls to that code to actually perform
+ * the allocation.
+ *
+ * There are two sorts of resources managed by this code. The first is
+ * the more familiar array (RMAN_ARRAY) type; resources in this class
+ * consist of a sequence of individually-allocatable objects which have
+ * been numbered in some well-defined order. Most of the resources
+ * are of this type, as it is the most familiar. The second type is
+ * called a gauge (RMAN_GAUGE), and models fungible resources (i.e.,
+ * resources in which each instance is indistinguishable from every
+ * other instance). The principal anticipated application of gauges
+ * is in the context of power consumption, where a bus may have a specific
+ * power budget which all attached devices share. RMAN_GAUGE is not
+ * implemented yet.
+ *
+ * For array resources, we make one simplifying assumption: two clients
+ * sharing the same resource must use the same range of indices. That
+ * is to say, sharing of overlapping-but-not-identical regions is not
+ * permitted.
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/bus.h> /* XXX debugging */
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <sys/sysctl.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * We use a linked list rather than a bitmap because we need to be able to
+ * represent potentially huge objects (like all of a processor's physical
+ * address space). That is also why the indices are defined to have type
+ * `unsigned long' -- that being the largest integral type in ISO C (1990).
+ * The 1999 version of C allows `long long'; we may need to switch to that
+ * at some point in the future, particularly if we want to support 36-bit
+ * addresses on IA32 hardware.
+ */
+struct resource_i {
+ struct resource r_r;
+ TAILQ_ENTRY(resource_i) r_link;
+ LIST_ENTRY(resource_i) r_sharelink;
+ LIST_HEAD(, resource_i) *r_sharehead;
+ u_long r_start; /* index of the first entry in this resource */
+ u_long r_end; /* index of the last entry (inclusive) */
+ u_int r_flags;
+ void *r_virtual; /* virtual address of this resource */
+ struct device *r_dev; /* device which has allocated this resource */
+ struct rman *r_rm; /* resource manager from whence this came */
+ int r_rid; /* optional rid for this resource. */
+};
+
+static int rman_debug = 0;
+TUNABLE_INT("debug.rman_debug", &rman_debug);
+SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RW,
+ &rman_debug, 0, "rman debug");
+
+#define DPRINTF(params) if (rman_debug) printf params
+
+static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager");
+
+struct rman_head rman_head;
+static struct mtx rman_mtx; /* mutex to protect rman_head */
+static int int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas);
+static int int_rman_deactivate_resource(struct resource_i *r);
+static int int_rman_release_resource(struct rman *rm, struct resource_i *r);
+
+static __inline struct resource_i *
+int_alloc_resource(int malloc_flag)
+{
+ struct resource_i *r;
+
+ r = malloc(sizeof *r, M_RMAN, malloc_flag | M_ZERO);
+ if (r != NULL) {
+ r->r_r.__r_i = r;
+ }
+ return (r);
+}
+
+int
+rman_init(struct rman *rm)
+{
+ static int once = 0;
+
+ if (once == 0) {
+ once = 1;
+ TAILQ_INIT(&rman_head);
+ mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF);
+ }
+
+ if (rm->rm_start == 0 && rm->rm_end == 0)
+ rm->rm_end = ~0ul;
+ if (rm->rm_type == RMAN_UNINIT)
+ panic("rman_init");
+ if (rm->rm_type == RMAN_GAUGE)
+ panic("implement RMAN_GAUGE");
+
+ TAILQ_INIT(&rm->rm_list);
+ rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO);
+ if (rm->rm_mtx == NULL)
+ return ENOMEM;
+ mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF);
+
+ mtx_lock(&rman_mtx);
+ TAILQ_INSERT_TAIL(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ return 0;
+}
+
+int
+rman_manage_region(struct rman *rm, u_long start, u_long end)
+{
+ struct resource_i *r, *s, *t;
+ int rv = 0;
+
+ DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
+ rm->rm_descr, start, end));
+ if (start < rm->rm_start || end > rm->rm_end)
+ return EINVAL;
+ r = int_alloc_resource(M_NOWAIT);
+ if (r == NULL)
+ return ENOMEM;
+ r->r_start = start;
+ r->r_end = end;
+ r->r_rm = rm;
+
+ mtx_lock(rm->rm_mtx);
+
+ /* Skip entries before us. */
+ TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+ if (s->r_end == ULONG_MAX)
+ break;
+ if (s->r_end + 1 >= r->r_start)
+ break;
+ }
+
+ /* If we ran off the end of the list, insert at the tail. */
+ if (s == NULL) {
+ TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
+ } else {
+ /* Check for any overlap with the current region. */
+ if (r->r_start <= s->r_end && r->r_end >= s->r_start) {
+ rv = EBUSY;
+ goto out;
+ }
+
+ /* Check for any overlap with the next region. */
+ t = TAILQ_NEXT(s, r_link);
+ if (t && r->r_start <= t->r_end && r->r_end >= t->r_start) {
+ rv = EBUSY;
+ goto out;
+ }
+
+ /*
+ * See if this region can be merged with the next region. If
+ * not, clear the pointer.
+ */
+ if (t && (r->r_end + 1 != t->r_start || t->r_flags != 0))
+ t = NULL;
+
+ /* See if we can merge with the current region. */
+ if (s->r_end + 1 == r->r_start && s->r_flags == 0) {
+ /* Can we merge all 3 regions? */
+ if (t != NULL) {
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(r, M_RMAN);
+ free(t, M_RMAN);
+ } else {
+ s->r_end = r->r_end;
+ free(r, M_RMAN);
+ }
+ } else if (t != NULL) {
+ /* Can we merge with just the next region? */
+ t->r_start = r->r_start;
+ free(r, M_RMAN);
+ } else if (s->r_end < r->r_start) {
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, r, r_link);
+ } else {
+ TAILQ_INSERT_BEFORE(s, r, r_link);
+ }
+ }
+out:
+ mtx_unlock(rm->rm_mtx);
+ return rv;
+}
+
+int
+rman_init_from_resource(struct rman *rm, struct resource *r)
+{
+ int rv;
+
+ if ((rv = rman_init(rm)) != 0)
+ return (rv);
+ return (rman_manage_region(rm, r->__r_i->r_start, r->__r_i->r_end));
+}
+
+int
+rman_fini(struct rman *rm)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_flags & RF_ALLOCATED) {
+ mtx_unlock(rm->rm_mtx);
+ return EBUSY;
+ }
+ }
+
+ /*
+ * There really should only be one of these if we are in this
+ * state and the code is working properly, but it can't hurt.
+ */
+ while (!TAILQ_EMPTY(&rm->rm_list)) {
+ r = TAILQ_FIRST(&rm->rm_list);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ free(r, M_RMAN);
+ }
+ mtx_unlock(rm->rm_mtx);
+ mtx_lock(&rman_mtx);
+ TAILQ_REMOVE(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ mtx_destroy(rm->rm_mtx);
+ free(rm->rm_mtx, M_RMAN);
+
+ return 0;
+}
+
+int
+rman_first_free_region(struct rman *rm, u_long *start, u_long *end)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (!(r->r_flags & RF_ALLOCATED)) {
+ *start = r->r_start;
+ *end = r->r_end;
+ mtx_unlock(rm->rm_mtx);
+ return (0);
+ }
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+}
+
+int
+rman_last_free_region(struct rman *rm, u_long *start, u_long *end)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH_REVERSE(r, &rm->rm_list, resource_head, r_link) {
+ if (!(r->r_flags & RF_ALLOCATED)) {
+ *start = r->r_start;
+ *end = r->r_end;
+ mtx_unlock(rm->rm_mtx);
+ return (0);
+ }
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+}
+
+/* Shrink or extend one or both ends of an allocated resource. */
+int
+rman_adjust_resource(struct resource *rr, u_long start, u_long end)
+{
+ struct resource_i *r, *s, *t, *new;
+ struct rman *rm;
+
+ /* Not supported for shared resources. */
+ r = rr->__r_i;
+ if (r->r_flags & (RF_TIMESHARE | RF_SHAREABLE))
+ return (EINVAL);
+
+ /*
+ * This does not support wholesale moving of a resource. At
+ * least part of the desired new range must overlap with the
+ * existing resource.
+ */
+ if (end < r->r_start || r->r_end < start)
+ return (EINVAL);
+
+ /*
+ * Find the two resource regions immediately adjacent to the
+ * allocated resource.
+ */
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+ if (s == r)
+ break;
+ }
+ if (s == NULL)
+ panic("resource not in list");
+#endif
+ s = TAILQ_PREV(r, resource_head, r_link);
+ t = TAILQ_NEXT(r, r_link);
+ KASSERT(s == NULL || s->r_end + 1 == r->r_start,
+ ("prev resource mismatch"));
+ KASSERT(t == NULL || r->r_end + 1 == t->r_start,
+ ("next resource mismatch"));
+
+ /*
+ * See if the changes are permitted. Shrinking is always allowed,
+ * but growing requires sufficient room in the adjacent region.
+ */
+ if (start < r->r_start && (s == NULL || (s->r_flags & RF_ALLOCATED) ||
+ s->r_start > start)) {
+ mtx_unlock(rm->rm_mtx);
+ return (EBUSY);
+ }
+ if (end > r->r_end && (t == NULL || (t->r_flags & RF_ALLOCATED) ||
+ t->r_end < end)) {
+ mtx_unlock(rm->rm_mtx);
+ return (EBUSY);
+ }
+
+ /*
+ * While holding the lock, grow either end of the resource as
+ * needed and shrink either end if the shrinking does not require
+ * allocating a new resource. We can safely drop the lock and then
+ * insert a new range to handle the shrinking case afterwards.
+ */
+ if (start < r->r_start ||
+ (start > r->r_start && s != NULL && !(s->r_flags & RF_ALLOCATED))) {
+ KASSERT(s->r_flags == 0, ("prev is busy"));
+ r->r_start = start;
+ if (s->r_start == start) {
+ TAILQ_REMOVE(&rm->rm_list, s, r_link);
+ free(s, M_RMAN);
+ } else
+ s->r_end = start - 1;
+ }
+ if (end > r->r_end ||
+ (end < r->r_end && t != NULL && !(t->r_flags & RF_ALLOCATED))) {
+ KASSERT(t->r_flags == 0, ("next is busy"));
+ r->r_end = end;
+ if (t->r_end == end) {
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else
+ t->r_start = end + 1;
+ }
+ mtx_unlock(rm->rm_mtx);
+
+ /*
+ * Handle the shrinking cases that require allocating a new
+ * resource to hold the newly-free region. We have to recheck
+ * if we still need this new region after acquiring the lock.
+ */
+ if (start > r->r_start) {
+ new = int_alloc_resource(M_WAITOK);
+ new->r_start = r->r_start;
+ new->r_end = start - 1;
+ new->r_rm = rm;
+ mtx_lock(rm->rm_mtx);
+ r->r_start = start;
+ s = TAILQ_PREV(r, resource_head, r_link);
+ if (s != NULL && !(s->r_flags & RF_ALLOCATED)) {
+ s->r_end = start - 1;
+ free(new, M_RMAN);
+ } else
+ TAILQ_INSERT_BEFORE(r, new, r_link);
+ mtx_unlock(rm->rm_mtx);
+ }
+ if (end < r->r_end) {
+ new = int_alloc_resource(M_WAITOK);
+ new->r_start = end + 1;
+ new->r_end = r->r_end;
+ new->r_rm = rm;
+ mtx_lock(rm->rm_mtx);
+ r->r_end = end;
+ t = TAILQ_NEXT(r, r_link);
+ if (t != NULL && !(t->r_flags & RF_ALLOCATED)) {
+ t->r_start = end + 1;
+ free(new, M_RMAN);
+ } else
+ TAILQ_INSERT_AFTER(&rm->rm_list, r, new, r_link);
+ mtx_unlock(rm->rm_mtx);
+ }
+ return (0);
+}
+
+struct resource *
+rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end,
+ u_long count, u_long bound, u_int flags,
+ struct device *dev)
+{
+ u_int want_activate;
+ struct resource_i *r, *s, *rv;
+ u_long rstart, rend, amask, bmask;
+
+ rv = NULL;
+
+ DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], "
+ "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end,
+ count, flags,
+ dev == NULL ? "<null>" : device_get_nameunit(dev)));
+ want_activate = (flags & RF_ACTIVE);
+ flags &= ~RF_ACTIVE;
+
+ mtx_lock(rm->rm_mtx);
+
+ for (r = TAILQ_FIRST(&rm->rm_list);
+ r && r->r_end < start;
+ r = TAILQ_NEXT(r, r_link))
+ ;
+
+ if (r == NULL) {
+ DPRINTF(("could not find a region\n"));
+ goto out;
+ }
+
+ amask = (1ul << RF_ALIGNMENT(flags)) - 1;
+ /* If bound is 0, bmask will also be 0 */
+ bmask = ~(bound - 1);
+ /*
+ * First try to find an acceptable totally-unshared region.
+ */
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end));
+ if (s->r_start + count - 1 > end) {
+			DPRINTF(("s->r_start (%#lx) + count - 1 > end (%#lx)\n",
+ s->r_start, end));
+ break;
+ }
+ if (s->r_flags & RF_ALLOCATED) {
+ DPRINTF(("region is allocated\n"));
+ continue;
+ }
+ rstart = ulmax(s->r_start, start);
+ /*
+ * Try to find a region by adjusting to boundary and alignment
+ * until both conditions are satisfied. This is not an optimal
+ * algorithm, but in most cases it isn't really bad, either.
+ */
+ do {
+ rstart = (rstart + amask) & ~amask;
+ if (((rstart ^ (rstart + count - 1)) & bmask) != 0)
+ rstart += bound - (rstart & ~bmask);
+ } while ((rstart & amask) != 0 && rstart < end &&
+ rstart < s->r_end);
+ rend = ulmin(s->r_end, ulmax(rstart + count - 1, end));
+ if (rstart > rend) {
+ DPRINTF(("adjusted start exceeds end\n"));
+ continue;
+ }
+ DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
+ rstart, rend, (rend - rstart + 1), count));
+
+ if ((rend - rstart + 1) >= count) {
+ DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n",
+ rstart, rend, (rend - rstart + 1)));
+ if ((s->r_end - s->r_start + 1) == count) {
+ DPRINTF(("candidate region is entire chunk\n"));
+ rv = s;
+ rv->r_flags |= RF_ALLOCATED | flags;
+ rv->r_dev = dev;
+ goto out;
+ }
+
+ /*
+ * If s->r_start < rstart and
+ * s->r_end > rstart + count - 1, then
+ * we need to split the region into three pieces
+ * (the middle one will get returned to the user).
+ * Otherwise, we are allocating at either the
+ * beginning or the end of s, so we only need to
+ * split it in two. The first case requires
+ * two new allocations; the second requires but one.
+ */
+ rv = int_alloc_resource(M_NOWAIT);
+ if (rv == NULL)
+ goto out;
+ rv->r_start = rstart;
+ rv->r_end = rstart + count - 1;
+ rv->r_flags = flags | RF_ALLOCATED;
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+
+ if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
+ DPRINTF(("splitting region in three parts: "
+ "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
+ s->r_start, rv->r_start - 1,
+ rv->r_start, rv->r_end,
+ rv->r_end + 1, s->r_end));
+ /*
+ * We are allocating in the middle.
+ */
+ r = int_alloc_resource(M_NOWAIT);
+ if (r == NULL) {
+ free(rv, M_RMAN);
+ rv = NULL;
+ goto out;
+ }
+ r->r_start = rv->r_end + 1;
+ r->r_end = s->r_end;
+ r->r_flags = s->r_flags;
+ r->r_rm = rm;
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ TAILQ_INSERT_AFTER(&rm->rm_list, rv, r,
+ r_link);
+ } else if (s->r_start == rv->r_start) {
+ DPRINTF(("allocating from the beginning\n"));
+ /*
+ * We are allocating at the beginning.
+ */
+ s->r_start = rv->r_end + 1;
+ TAILQ_INSERT_BEFORE(s, rv, r_link);
+ } else {
+ DPRINTF(("allocating at the end\n"));
+ /*
+ * We are allocating at the end.
+ */
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ }
+ goto out;
+ }
+ }
+
+ /*
+ * Now find an acceptable shared region, if the client's requirements
+ * allow sharing. By our implementation restriction, a candidate
+ * region must match exactly by both size and sharing type in order
+ * to be considered compatible with the client's request. (The
+ * former restriction could probably be lifted without too much
+ * additional work, but this does not seem warranted.)
+ */
+ DPRINTF(("no unshared regions found\n"));
+ if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0)
+ goto out;
+
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ if (s->r_start > end)
+ break;
+ if ((s->r_flags & flags) != flags)
+ continue;
+ rstart = ulmax(s->r_start, start);
+ rend = ulmin(s->r_end, ulmax(start + count - 1, end));
+ if (s->r_start >= start && s->r_end <= end
+ && (s->r_end - s->r_start + 1) == count &&
+ (s->r_start & amask) == 0 &&
+ ((s->r_start ^ s->r_end) & bmask) == 0) {
+ rv = int_alloc_resource(M_NOWAIT);
+ if (rv == NULL)
+ goto out;
+ rv->r_start = s->r_start;
+ rv->r_end = s->r_end;
+ rv->r_flags = s->r_flags &
+ (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE);
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+ if (s->r_sharehead == NULL) {
+ s->r_sharehead = malloc(sizeof *s->r_sharehead,
+ M_RMAN, M_NOWAIT | M_ZERO);
+ if (s->r_sharehead == NULL) {
+ free(rv, M_RMAN);
+ rv = NULL;
+ goto out;
+ }
+ LIST_INIT(s->r_sharehead);
+ LIST_INSERT_HEAD(s->r_sharehead, s,
+ r_sharelink);
+ s->r_flags |= RF_FIRSTSHARE;
+ }
+ rv->r_sharehead = s->r_sharehead;
+ LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink);
+ goto out;
+ }
+ }
+
+ /*
+ * We couldn't find anything.
+ */
+out:
+ /*
+ * If the user specified RF_ACTIVE in the initial flags,
+ * which is reflected in `want_activate', we attempt to atomically
+ * activate the resource. If this fails, we release the resource
+ * and indicate overall failure. (This behavior probably doesn't
+ * make sense for RF_TIMESHARE-type resources.)
+ */
+ if (rv && want_activate) {
+ struct resource_i *whohas;
+ if (int_rman_activate_resource(rm, rv, &whohas)) {
+ int_rman_release_resource(rm, rv);
+ rv = NULL;
+ }
+ }
+
+ mtx_unlock(rm->rm_mtx);
+ return (rv == NULL ? NULL : &rv->r_r);
+}
+
+struct resource *
+rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count,
+ u_int flags, struct device *dev)
+{
+
+ return (rman_reserve_resource_bound(rm, start, end, count, 0, flags,
+ dev));
+}
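+
+/*
+ * Illustrative sketch only, kept disabled: a hypothetical bus driver ties
+ * the pieces above together by initializing an rman, handing it the index
+ * range it owns, and then carving allocations out of that range.  The "foo"
+ * names and the 0x100-0x1ff port range are made up for the example.
+ */
+#if 0
+static struct rman foo_port_rman;
+
+static int
+foo_port_rman_example(void)
+{
+	struct resource *res;
+
+	foo_port_rman.rm_type = RMAN_ARRAY;
+	foo_port_rman.rm_descr = "foo I/O ports";
+	if (rman_init(&foo_port_rman) != 0 ||
+	    rman_manage_region(&foo_port_rman, 0x100, 0x1ff) != 0)
+		return (ENXIO);
+	/* Reserve 16 ports anywhere in the managed range, active at once. */
+	res = rman_reserve_resource(&foo_port_rman, 0x100, 0x1ff, 0x10,
+	    RF_ACTIVE, NULL);
+	if (res == NULL)
+		return (EBUSY);
+	/* ... use the range, then hand it back ... */
+	rman_release_resource(res);
+	return (0);
+}
+#endif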
+
+static int
+int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas)
+{
+ struct resource_i *s;
+ int ok;
+
+ /*
+ * If we are not timesharing, then there is nothing much to do.
+ * If we already have the resource, then there is nothing at all to do.
+ * If we are not on a sharing list with anybody else, then there is
+ * little to do.
+ */
+ if ((r->r_flags & RF_TIMESHARE) == 0
+ || (r->r_flags & RF_ACTIVE) != 0
+ || r->r_sharehead == NULL) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+
+ ok = 1;
+ for (s = LIST_FIRST(r->r_sharehead); s && ok;
+ s = LIST_NEXT(s, r_sharelink)) {
+ if ((s->r_flags & RF_ACTIVE) != 0) {
+ ok = 0;
+ *whohas = s;
+ }
+ }
+ if (ok) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+ return EBUSY;
+}
+
+int
+rman_activate_resource(struct resource *re)
+{
+ int rv;
+ struct resource_i *r, *whohas;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ mtx_unlock(rm->rm_mtx);
+ return rv;
+}
+
+int
+rman_await_resource(struct resource *re, int pri, int timo)
+{
+ int rv;
+ struct resource_i *r, *whohas;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ for (;;) {
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ if (rv != EBUSY)
+ return (rv); /* returns with mutex held */
+
+ if (r->r_sharehead == NULL)
+ panic("rman_await_resource");
+ whohas->r_flags |= RF_WANTED;
+ rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo);
+ if (rv) {
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+ }
+ }
+}
+
+static int
+int_rman_deactivate_resource(struct resource_i *r)
+{
+
+ r->r_flags &= ~RF_ACTIVE;
+ if (r->r_flags & RF_WANTED) {
+ r->r_flags &= ~RF_WANTED;
+ wakeup(r->r_sharehead);
+ }
+ return 0;
+}
+
+int
+rman_deactivate_resource(struct resource *r)
+{
+ struct rman *rm;
+
+ rm = r->__r_i->r_rm;
+ mtx_lock(rm->rm_mtx);
+ int_rman_deactivate_resource(r->__r_i);
+ mtx_unlock(rm->rm_mtx);
+ return 0;
+}
+
+static int
+int_rman_release_resource(struct rman *rm, struct resource_i *r)
+{
+ struct resource_i *s, *t;
+
+ if (r->r_flags & RF_ACTIVE)
+ int_rman_deactivate_resource(r);
+
+ /*
+ * Check for a sharing list first. If there is one, then we don't
+ * have to think as hard.
+ */
+ if (r->r_sharehead) {
+ /*
+ * If a sharing list exists, then we know there are at
+ * least two sharers.
+ *
+		 * If we are on the main resource list, appoint someone else.
+ */
+ LIST_REMOVE(r, r_sharelink);
+ s = LIST_FIRST(r->r_sharehead);
+ if (r->r_flags & RF_FIRSTSHARE) {
+ s->r_flags |= RF_FIRSTSHARE;
+ TAILQ_INSERT_BEFORE(r, s, r_link);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ }
+
+ /*
+ * Make sure that the sharing list goes away completely
+ * if the resource is no longer being shared at all.
+ */
+ if (LIST_NEXT(s, r_sharelink) == NULL) {
+ free(s->r_sharehead, M_RMAN);
+ s->r_sharehead = NULL;
+ s->r_flags &= ~RF_FIRSTSHARE;
+ }
+ goto out;
+ }
+
+ /*
+ * Look at the adjacent resources in the list and see if our
+ * segment can be merged with any of them. If either of the
+ * resources is allocated or is not exactly adjacent then they
+ * cannot be merged with our segment.
+ */
+ s = TAILQ_PREV(r, resource_head, r_link);
+ if (s != NULL && ((s->r_flags & RF_ALLOCATED) != 0 ||
+ s->r_end + 1 != r->r_start))
+ s = NULL;
+ t = TAILQ_NEXT(r, r_link);
+ if (t != NULL && ((t->r_flags & RF_ALLOCATED) != 0 ||
+ r->r_end + 1 != t->r_start))
+ t = NULL;
+
+ if (s != NULL && t != NULL) {
+ /*
+ * Merge all three segments.
+ */
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else if (s != NULL) {
+ /*
+ * Merge previous segment with ours.
+ */
+ s->r_end = r->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else if (t != NULL) {
+ /*
+ * Merge next segment with ours.
+ */
+ t->r_start = r->r_start;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else {
+ /*
+ * At this point, we know there is nothing we
+ * can potentially merge with, because on each
+ * side, there is either nothing there or what is
+ * there is still allocated. In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ r->r_dev = NULL;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *re)
+{
+ int rv;
+ struct resource_i *r;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_release_resource(rm, r);
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+uint32_t
+rman_make_alignment_flags(uint32_t size)
+{
+ int i;
+
+ /*
+	 * Find the highest bit set, and add one if more than one bit is
+	 * set.  We're effectively computing ceil(log2(size)) here.
+ */
+ for (i = 31; i > 0; i--)
+ if ((1 << i) & size)
+ break;
+ if (~(1 << i) & size)
+ i++;
+
+ return(RF_ALIGNMENT_LOG2(i));
+}
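+
+/*
+ * Worked example (illustrative only): a size of 0x1000 has a single bit set
+ * at position 12, so the result is RF_ALIGNMENT_LOG2(12); a size of 0x1800
+ * sets bits 11 and 12, so the result is bumped to RF_ALIGNMENT_LOG2(13),
+ * i.e. the next power of two large enough to contain the request.
+ */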
+
+void
+rman_set_start(struct resource *r, u_long start)
+{
+ r->__r_i->r_start = start;
+}
+
+u_long
+rman_get_start(struct resource *r)
+{
+ return (r->__r_i->r_start);
+}
+
+void
+rman_set_end(struct resource *r, u_long end)
+{
+ r->__r_i->r_end = end;
+}
+
+u_long
+rman_get_end(struct resource *r)
+{
+ return (r->__r_i->r_end);
+}
+
+u_long
+rman_get_size(struct resource *r)
+{
+ return (r->__r_i->r_end - r->__r_i->r_start + 1);
+}
+
+u_int
+rman_get_flags(struct resource *r)
+{
+ return (r->__r_i->r_flags);
+}
+
+void
+rman_set_virtual(struct resource *r, void *v)
+{
+ r->__r_i->r_virtual = v;
+}
+
+void *
+rman_get_virtual(struct resource *r)
+{
+ return (r->__r_i->r_virtual);
+}
+
+void
+rman_set_bustag(struct resource *r, bus_space_tag_t t)
+{
+ r->r_bustag = t;
+}
+
+bus_space_tag_t
+rman_get_bustag(struct resource *r)
+{
+ return (r->r_bustag);
+}
+
+void
+rman_set_bushandle(struct resource *r, bus_space_handle_t h)
+{
+ r->r_bushandle = h;
+}
+
+bus_space_handle_t
+rman_get_bushandle(struct resource *r)
+{
+ return (r->r_bushandle);
+}
+
+void
+rman_set_rid(struct resource *r, int rid)
+{
+ r->__r_i->r_rid = rid;
+}
+
+int
+rman_get_rid(struct resource *r)
+{
+ return (r->__r_i->r_rid);
+}
+
+void
+rman_set_device(struct resource *r, struct device *dev)
+{
+ r->__r_i->r_dev = dev;
+}
+
+struct device *
+rman_get_device(struct resource *r)
+{
+ return (r->__r_i->r_dev);
+}
+
+int
+rman_is_region_manager(struct resource *r, struct rman *rm)
+{
+
+ return (r->__r_i->r_rm == rm);
+}
+
+/*
+ * Sysctl interface for scanning the resource lists.
+ *
+ * We take two input parameters; the index into the list of resource
+ * managers, and the resource offset into the list.
+ */
+static int
+sysctl_rman(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int rman_idx, res_idx;
+ struct rman *rm;
+ struct resource_i *res;
+ struct resource_i *sres;
+ struct u_rman urm;
+ struct u_resource ures;
+ int error;
+
+ if (namelen != 3)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+ rman_idx = name[1];
+ res_idx = name[2];
+
+ /*
+ * Find the indexed resource manager
+ */
+ mtx_lock(&rman_mtx);
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ if (rman_idx-- == 0)
+ break;
+ }
+ mtx_unlock(&rman_mtx);
+ if (rm == NULL)
+ return (ENOENT);
+
+ /*
+ * If the resource index is -1, we want details on the
+ * resource manager.
+ */
+ if (res_idx == -1) {
+ bzero(&urm, sizeof(urm));
+ urm.rm_handle = (uintptr_t)rm;
+ if (rm->rm_descr != NULL)
+ strlcpy(urm.rm_descr, rm->rm_descr, RM_TEXTLEN);
+ urm.rm_start = rm->rm_start;
+ urm.rm_size = rm->rm_end - rm->rm_start + 1;
+ urm.rm_type = rm->rm_type;
+
+ error = SYSCTL_OUT(req, &urm, sizeof(urm));
+ return (error);
+ }
+
+ /*
+ * Find the indexed resource and return it.
+ */
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(res, &rm->rm_list, r_link) {
+ if (res->r_sharehead != NULL) {
+ LIST_FOREACH(sres, res->r_sharehead, r_sharelink)
+ if (res_idx-- == 0) {
+ res = sres;
+ goto found;
+ }
+ }
+ else if (res_idx-- == 0)
+ goto found;
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+
+found:
+ bzero(&ures, sizeof(ures));
+ ures.r_handle = (uintptr_t)res;
+ ures.r_parent = (uintptr_t)res->r_rm;
+ ures.r_device = (uintptr_t)res->r_dev;
+ if (res->r_dev != NULL) {
+ if (device_get_name(res->r_dev) != NULL) {
+ snprintf(ures.r_devname, RM_TEXTLEN,
+ "%s%d",
+ device_get_name(res->r_dev),
+ device_get_unit(res->r_dev));
+ } else {
+ strlcpy(ures.r_devname, "nomatch",
+ RM_TEXTLEN);
+ }
+ } else {
+ ures.r_devname[0] = '\0';
+ }
+ ures.r_start = res->r_start;
+ ures.r_size = res->r_end - res->r_start + 1;
+ ures.r_flags = res->r_flags;
+
+ mtx_unlock(rm->rm_mtx);
+ error = SYSCTL_OUT(req, &ures, sizeof(ures));
+ return (error);
+}
+
+static SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
+ "kernel resource manager");
+
+#ifdef DDB
+static void
+dump_rman_header(struct rman *rm)
+{
+
+ if (db_pager_quit)
+ return;
+ db_printf("rman %p: %s (0x%lx-0x%lx full range)\n",
+ rm, rm->rm_descr, rm->rm_start, rm->rm_end);
+}
+
+static void
+dump_rman(struct rman *rm)
+{
+ struct resource_i *r;
+ const char *devname;
+
+ if (db_pager_quit)
+ return;
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_dev != NULL) {
+ devname = device_get_nameunit(r->r_dev);
+ if (devname == NULL)
+ devname = "nomatch";
+ } else
+ devname = NULL;
+ db_printf(" 0x%lx-0x%lx ", r->r_start, r->r_end);
+ if (devname != NULL)
+ db_printf("(%s)\n", devname);
+ else
+ db_printf("----\n");
+ if (db_pager_quit)
+ return;
+ }
+}
+
+DB_SHOW_COMMAND(rman, db_show_rman)
+{
+
+ if (have_addr) {
+ dump_rman_header((struct rman *)addr);
+ dump_rman((struct rman *)addr);
+ }
+}
+
+DB_SHOW_COMMAND(rmans, db_show_rmans)
+{
+ struct rman *rm;
+
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ dump_rman_header(rm);
+ }
+}
+
+DB_SHOW_ALL_COMMAND(rman, db_show_all_rman)
+{
+ struct rman *rm;
+
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ dump_rman_header(rm);
+ dump_rman(rm);
+ }
+}
+DB_SHOW_ALIAS(allrman, db_show_all_rman);
+#endif
diff --git a/sys/kern/subr_rtc.c b/sys/kern/subr_rtc.c
new file mode 100644
index 0000000..ed2befc
--- /dev/null
+++ b/sys/kern/subr_rtc.c
@@ -0,0 +1,178 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Portions of this software were developed by Julien Ridoux at the University
+ * of Melbourne under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to set that up itself.
+ * This code is not yet used by all architectures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#ifdef FFCLOCK
+#include <sys/timeffc.h>
+#endif
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static device_t clock_dev = NULL;
+static long clock_res;
+static struct timespec clock_adj;
+
+/* XXX: this should live under "kern." now; it's no longer machdep. */
+static int disable_rtc_set;
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set,
+ 0, "Disallow adjusting time-of-day clock");
+
+void
+clock_register(device_t dev, long res) /* res has units of microseconds */
+{
+
+ if (clock_dev != NULL) {
+ if (clock_res > res) {
+ if (bootverbose)
+ device_printf(dev, "not installed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(clock_dev));
+ return;
+ }
+ if (bootverbose)
+ device_printf(clock_dev, "removed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(dev));
+ }
+ clock_dev = dev;
+ clock_res = res;
+ clock_adj.tv_sec = res / 2 / 1000000;
+ clock_adj.tv_nsec = res / 2 % 1000000 * 1000;
+ if (bootverbose)
+ device_printf(dev, "registered as a time-of-day clock "
+ "(resolution %ldus, adjustment %jd.%09jds)\n", res,
+ (intmax_t)clock_adj.tv_sec, (intmax_t)clock_adj.tv_nsec);
+}
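+
+/*
+ * Illustrative sketch only, kept disabled (no such driver exists here): a
+ * hypothetical RTC driver with one-second resolution would register itself
+ * from its attach routine roughly as follows.
+ */
+#if 0
+static int
+foo_rtc_attach(device_t dev)
+{
+	/* ... program the hardware, hook up the clock_if methods ... */
+	clock_register(dev, 1000000);	/* resolution in microseconds */
+	return (0);
+}
+#endif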
+
+/*
+ * inittodr and resettodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and
+ * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the system time from the time-of-day clock, falling back to the
+ * given time base (e.g. a timestamp obtained from a filesystem).
+ */
+void
+inittodr(time_t base)
+{
+ struct timespec ts;
+ int error;
+
+ if (clock_dev == NULL) {
+ printf("warning: no time-of-day clock registered, system time "
+ "will not be set accurately\n");
+ goto wrong_time;
+ }
+ /* XXX: We should poll all registered RTCs in case of failure */
+ error = CLOCK_GETTIME(clock_dev, &ts);
+ if (error != 0 && error != EINVAL) {
+ printf("warning: clock_gettime failed (%d), the system time "
+ "will not be set accurately\n", error);
+ goto wrong_time;
+ }
+ if (error == EINVAL || ts.tv_sec < 0) {
+ printf("Invalid time in real time clock.\n"
+ "Check and reset the date immediately!\n");
+ goto wrong_time;
+ }
+
+ ts.tv_sec += utc_offset();
+ timespecadd(&ts, &clock_adj);
+ tc_setclock(&ts);
+#ifdef FFCLOCK
+ ffclock_reset_clock(&ts);
+#endif
+ return;
+
+wrong_time:
+ if (base > 0) {
+ ts.tv_sec = base;
+ ts.tv_nsec = 0;
+ tc_setclock(&ts);
+ }
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr(void)
+{
+ struct timespec ts;
+ int error;
+
+ if (disable_rtc_set || clock_dev == NULL)
+ return;
+
+ getnanotime(&ts);
+ timespecadd(&ts, &clock_adj);
+ ts.tv_sec -= utc_offset();
+ /* XXX: We should really set all registered RTCs */
+ if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0)
+ printf("warning: clock_settime failed (%d), time-of-day clock "
+ "not adjusted to system time\n", error);
+}
diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c
new file mode 100644
index 0000000..68a7b15
--- /dev/null
+++ b/sys/kern/subr_sbuf.c
@@ -0,0 +1,831 @@
+/*-
+ * Copyright (c) 2000-2008 Poul-Henning Kamp
+ * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#ifdef _KERNEL
+#include <sys/ctype.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <machine/stdarg.h>
+#else /* _KERNEL */
+#include <ctype.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif /* _KERNEL */
+
+#include <sys/sbuf.h>
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers");
+#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK)
+#define SBFREE(buf) free(buf, M_SBUF)
+#else /* _KERNEL */
+#define KASSERT(e, m)
+#define SBMALLOC(size) malloc(size)
+#define SBFREE(buf) free(buf)
+#endif /* _KERNEL */
+
+/*
+ * Predicates
+ */
+#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC)
+#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT)
+#define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED)
+#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1)
+#define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1))
+#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND)
+#define SBUF_ISSECTION(s) ((s)->s_flags & SBUF_INSECTION)
+
+/*
+ * Set / clear flags
+ */
+#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0)
+#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0)
+
+#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */
+
+#ifdef PAGE_SIZE
+#define SBUF_MAXEXTENDSIZE PAGE_SIZE
+#define SBUF_MAXEXTENDINCR PAGE_SIZE
+#else
+#define SBUF_MAXEXTENDSIZE 4096
+#define SBUF_MAXEXTENDINCR 4096
+#endif
+
+/*
+ * Debugging support
+ */
+#if defined(_KERNEL) && defined(INVARIANTS)
+
+static void
+_assert_sbuf_integrity(const char *fun, struct sbuf *s)
+{
+
+ KASSERT(s != NULL,
+ ("%s called with a NULL sbuf pointer", fun));
+ KASSERT(s->s_buf != NULL,
+ ("%s called with uninitialized or corrupt sbuf", fun));
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%jd >= %jd)",
+ (intmax_t)s->s_len, (intmax_t)s->s_size));
+}
+
+static void
+_assert_sbuf_state(const char *fun, struct sbuf *s, int state)
+{
+
+ KASSERT((s->s_flags & SBUF_FINISHED) == state,
+ ("%s called with %sfinished or corrupt sbuf", fun,
+ (state ? "un" : "")));
+}
+
+#define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s))
+#define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i))
+
+#else /* _KERNEL && INVARIANTS */
+
+#define assert_sbuf_integrity(s) do { } while (0)
+#define assert_sbuf_state(s, i) do { } while (0)
+
+#endif /* _KERNEL && INVARIANTS */
+
+#ifdef CTASSERT
+CTASSERT(powerof2(SBUF_MAXEXTENDSIZE));
+CTASSERT(powerof2(SBUF_MAXEXTENDINCR));
+#endif
+
+static int
+sbuf_extendsize(int size)
+{
+ int newsize;
+
+ if (size < (int)SBUF_MAXEXTENDSIZE) {
+ newsize = SBUF_MINEXTENDSIZE;
+ while (newsize < size)
+ newsize *= 2;
+ } else {
+ newsize = roundup2(size, SBUF_MAXEXTENDINCR);
+ }
+ KASSERT(newsize >= size, ("%s: %d < %d\n", __func__, newsize, size));
+ return (newsize);
+}
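+
+/*
+ * Worked example (illustrative only, assuming PAGE_SIZE is 4096): a request
+ * for 20 bytes doubles up from SBUF_MINEXTENDSIZE to 32, while a request for
+ * 5000 bytes is rounded up to the next multiple of SBUF_MAXEXTENDINCR, 8192.
+ */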
+
+/*
+ * Extend an sbuf.
+ */
+static int
+sbuf_extend(struct sbuf *s, int addlen)
+{
+ char *newbuf;
+ int newsize;
+
+ if (!SBUF_CANEXTEND(s))
+ return (-1);
+ newsize = sbuf_extendsize(s->s_size + addlen);
+ newbuf = SBMALLOC(newsize);
+ if (newbuf == NULL)
+ return (-1);
+ memcpy(newbuf, s->s_buf, s->s_size);
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ else
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ s->s_buf = newbuf;
+ s->s_size = newsize;
+ return (0);
+}
+
+/*
+ * Initialize the internals of an sbuf.
+ * If buf is non-NULL, it points to a static or already-allocated string
+ * big enough to hold at least length characters.
+ */
+static struct sbuf *
+sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags)
+{
+
+ memset(s, 0, sizeof(*s));
+ s->s_flags = flags;
+ s->s_size = length;
+ s->s_buf = buf;
+
+ if ((s->s_flags & SBUF_AUTOEXTEND) == 0) {
+ KASSERT(s->s_size >= 0,
+ ("attempt to create a too small sbuf"));
+ }
+
+ if (s->s_buf != NULL)
+ return (s);
+
+ if ((flags & SBUF_AUTOEXTEND) != 0)
+ s->s_size = sbuf_extendsize(s->s_size);
+
+ s->s_buf = SBMALLOC(s->s_size);
+ if (s->s_buf == NULL)
+ return (NULL);
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ return (s);
+}
+
+/*
+ * Initialize an sbuf.
+ * If buf is non-NULL, it points to a static or already-allocated string
+ * big enough to hold at least length characters.
+ */
+struct sbuf *
+sbuf_new(struct sbuf *s, char *buf, int length, int flags)
+{
+
+ KASSERT(length >= 0,
+ ("attempt to create an sbuf of negative length (%d)", length));
+ KASSERT((flags & ~SBUF_USRFLAGMSK) == 0,
+ ("%s called with invalid flags", __func__));
+
+ flags &= SBUF_USRFLAGMSK;
+ if (s != NULL)
+ return (sbuf_newbuf(s, buf, length, flags));
+
+ s = SBMALLOC(sizeof(*s));
+ if (s == NULL)
+ return (NULL);
+ if (sbuf_newbuf(s, buf, length, flags) == NULL) {
+ SBFREE(s);
+ return (NULL);
+ }
+ SBUF_SETFLAG(s, SBUF_DYNSTRUCT);
+ return (s);
+}
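+
+/*
+ * Illustrative sketch only, kept disabled: a typical consumer builds a
+ * string into an automatically extending sbuf, finishes it, and then reads
+ * the result back with sbuf_data().
+ */
+#if 0
+static void
+sbuf_usage_example(void)
+{
+	struct sbuf *sb;
+
+	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
+	if (sb == NULL)
+		return;
+	sbuf_cat(sb, "hello");
+	sbuf_printf(sb, ", %s #%d", "world", 1);
+	if (sbuf_finish(sb) == 0)
+		printf("%s\n", sbuf_data(sb));
+	sbuf_delete(sb);
+}
+#endif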
+
+#ifdef _KERNEL
+/*
+ * Create an sbuf with uio data
+ */
+struct sbuf *
+sbuf_uionew(struct sbuf *s, struct uio *uio, int *error)
+{
+
+ KASSERT(uio != NULL,
+ ("%s called with NULL uio pointer", __func__));
+ KASSERT(error != NULL,
+ ("%s called with NULL error pointer", __func__));
+
+ s = sbuf_new(s, NULL, uio->uio_resid + 1, 0);
+ if (s == NULL) {
+ *error = ENOMEM;
+ return (NULL);
+ }
+ *error = uiomove(s->s_buf, uio->uio_resid, uio);
+ if (*error != 0) {
+ sbuf_delete(s);
+ return (NULL);
+ }
+ s->s_len = s->s_size - 1;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len = s->s_size - 1;
+ *error = 0;
+ return (s);
+}
+#endif
+
+/*
+ * Clear an sbuf and reset its position.
+ */
+void
+sbuf_clear(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ SBUF_CLEARFLAG(s, SBUF_FINISHED);
+ s->s_error = 0;
+ s->s_len = 0;
+ s->s_sect_len = 0;
+}
+
+/*
+ * Set the sbuf's end position to an arbitrary value.
+ * Effectively truncates the sbuf at the new position.
+ */
+int
+sbuf_setpos(struct sbuf *s, ssize_t pos)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(pos >= 0,
+ ("attempt to seek to a negative position (%jd)", (intmax_t)pos));
+ KASSERT(pos < s->s_size,
+ ("attempt to seek past end of sbuf (%jd >= %jd)",
+ (intmax_t)pos, (intmax_t)s->s_size));
+ KASSERT(!SBUF_ISSECTION(s),
+ ("attempt to seek when in a section"));
+
+ if (pos < 0 || pos > s->s_len)
+ return (-1);
+ s->s_len = pos;
+ return (0);
+}
+
+/*
+ * Set up a drain function and argument on an sbuf, to which data is flushed
+ * when the sbuf buffer overflows.
+ */
+void
+sbuf_set_drain(struct sbuf *s, sbuf_drain_func *func, void *ctx)
+{
+
+ assert_sbuf_state(s, 0);
+ assert_sbuf_integrity(s);
+ KASSERT(func == s->s_drain_func || s->s_len == 0,
+ ("Cannot change drain to %p on non-empty sbuf %p", func, s));
+ s->s_drain_func = func;
+ s->s_drain_arg = ctx;
+}
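+
+/*
+ * Illustrative sketch only, kept disabled (hypothetical callback): a drain
+ * function is handed the buffered data and returns how many bytes it
+ * consumed, or a negative errno on failure.  This one merely counts what
+ * flows through, via a size_t passed as the drain argument; a consumer
+ * would install it with sbuf_set_drain(sb, sbuf_count_drain, &total).
+ */
+#if 0
+static int
+sbuf_count_drain(void *arg, const char *data, int len)
+{
+	size_t *total;
+
+	total = arg;
+	*total += len;		/* pretend we consumed everything */
+	return (len);
+}
+#endif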
+
+/*
+ * Call the drain and process the return.
+ */
+static int
+sbuf_drain(struct sbuf *s)
+{
+ int len;
+
+ KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s));
+ KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s));
+ len = s->s_drain_func(s->s_drain_arg, s->s_buf, s->s_len);
+ if (len < 0) {
+ s->s_error = -len;
+ return (s->s_error);
+ }
+ KASSERT(len > 0 && len <= s->s_len,
+ ("Bad drain amount %d for sbuf %p", len, s));
+ s->s_len -= len;
+ /*
+ * Fast path for the expected case where all the data was
+ * drained.
+ */
+ if (s->s_len == 0)
+ return (0);
+ /*
+ * Move the remaining characters to the beginning of the
+ * string.
+ */
+ memmove(s->s_buf, s->s_buf + len, s->s_len);
+ return (0);
+}
+
+/*
+ * Append a byte to an sbuf. This is the core function for appending
+ * to an sbuf and is the main place that deals with extending the
+ * buffer and marking overflow.
+ */
+static void
+sbuf_put_byte(struct sbuf *s, int c)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return;
+ if (SBUF_FREESPACE(s) <= 0) {
+ /*
+ * If there is a drain, use it, otherwise extend the
+ * buffer.
+ */
+ if (s->s_drain_func != NULL)
+ (void)sbuf_drain(s);
+ else if (sbuf_extend(s, 1) < 0)
+ s->s_error = ENOMEM;
+ if (s->s_error != 0)
+ return;
+ }
+ s->s_buf[s->s_len++] = c;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len++;
+}
+
+/*
+ * Append a byte string to an sbuf.
+ */
+int
+sbuf_bcat(struct sbuf *s, const void *buf, size_t len)
+{
+ const char *str = buf;
+ const char *end = str + len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return (-1);
+ for (; str < end; str++) {
+ sbuf_put_byte(s, *str);
+ if (s->s_error != 0)
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Copy a byte string from userland into an sbuf.
+ */
+int
+sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("Nonsensical copyin to sbuf %p with a drain", s));
+
+ if (s->s_error != 0)
+ return (-1);
+ if (len == 0)
+ return (0);
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len - SBUF_FREESPACE(s));
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ }
+ if (copyin(uaddr, s->s_buf + s->s_len, len) != 0)
+ return (-1);
+ s->s_len += len;
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy a byte string into an sbuf.
+ */
+int
+sbuf_bcpy(struct sbuf *s, const void *buf, size_t len)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_bcat(s, buf, len));
+}
+
+/*
+ * Append a string to an sbuf.
+ */
+int
+sbuf_cat(struct sbuf *s, const char *str)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return (-1);
+
+ while (*str != '\0') {
+ sbuf_put_byte(s, *str++);
+ if (s->s_error != 0)
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Append a string from userland to an sbuf.
+ */
+int
+sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+ size_t done;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("Nonsensical copyin to sbuf %p with a drain", s));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ if (len == 0)
+ len = SBUF_FREESPACE(s); /* XXX return 0? */
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len);
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ }
+ switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) {
+ case ENAMETOOLONG:
+ s->s_error = ENOMEM;
+ /* fall through */
+ case 0:
+ s->s_len += done - 1;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len += done - 1;
+ break;
+ default:
+ return (-1); /* XXX */
+ }
+
+ return (done);
+}
+#endif
+
+/*
+ * Copy a string into an sbuf.
+ */
+int
+sbuf_cpy(struct sbuf *s, const char *str)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_cat(s, str));
+}
+
+/*
+ * Format the given argument list and append the resulting string to an sbuf.
+ */
+#ifdef _KERNEL
+
+/*
+ * Append a non-NUL character to an sbuf. This prototype signature is
+ * suitable for use with kvprintf(9).
+ */
+static void
+sbuf_putc_func(int c, void *arg)
+{
+
+ if (c != '\0')
+ sbuf_put_byte(arg, c);
+}
+
+int
+sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(fmt != NULL,
+ ("%s called with a NULL format string", __func__));
+
+ (void)kvprintf(fmt, sbuf_putc_func, s, 10, ap);
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+#else /* !_KERNEL */
+int
+sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
+{
+ va_list ap_copy;
+ int error, len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(fmt != NULL,
+ ("%s called with a NULL format string", __func__));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ /*
+ * For the moment, there is no way to get vsnprintf(3) to hand
+ * back a character at a time, to push everything into
+ * sbuf_putc_func() as was done for the kernel.
+ *
+	 * In userspace, while drains are useful, there is generally no
+	 * problem calling malloc(3) when we run out of space.  So expand
+	 * a userland sbuf if there is not enough room for the data
+	 * produced by sbuf_[v]printf(3).
+ */
+
+ error = 0;
+ do {
+ va_copy(ap_copy, ap);
+ len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1,
+ fmt, ap_copy);
+ va_end(ap_copy);
+
+ if (SBUF_FREESPACE(s) >= len)
+ break;
+ /* Cannot print with the current available space. */
+ if (s->s_drain_func != NULL && s->s_len > 0)
+ error = sbuf_drain(s);
+ else
+ error = sbuf_extend(s, len - SBUF_FREESPACE(s));
+ } while (error == 0);
+
+ /*
+ * s->s_len is the length of the string, without the terminating nul.
+ * When updating s->s_len, we must subtract 1 from the length that
+ * we passed into vsnprintf() because that length includes the
+ * terminating nul.
+ *
+ * vsnprintf() returns the amount that would have been copied,
+ * given sufficient space, so don't over-increment s_len.
+ */
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ s->s_len += len;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len += len;
+ if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s))
+ s->s_error = ENOMEM;
+
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
+
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+#endif /* _KERNEL */
+
+/*
+ * Format the given arguments and append the resulting string to an sbuf.
+ */
+int
+sbuf_printf(struct sbuf *s, const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+
+ va_start(ap, fmt);
+ result = sbuf_vprintf(s, fmt, ap);
+ va_end(ap);
+ return (result);
+}
+
+/*
+ * Append a character to an sbuf.
+ */
+int
+sbuf_putc(struct sbuf *s, int c)
+{
+
+ sbuf_put_byte(s, c);
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Trim whitespace characters from end of an sbuf.
+ */
+int
+sbuf_trim(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ while (s->s_len > 0 && isspace(s->s_buf[s->s_len-1])) {
+ --s->s_len;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len--;
+ }
+
+ return (0);
+}
+
+/*
+ * Check if an sbuf has an error.
+ */
+int
+sbuf_error(const struct sbuf *s)
+{
+
+ return (s->s_error);
+}
+
+/*
+ * Finish off an sbuf.
+ */
+int
+sbuf_finish(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_drain_func != NULL) {
+ while (s->s_len > 0 && s->s_error == 0)
+ s->s_error = sbuf_drain(s);
+ }
+ s->s_buf[s->s_len] = '\0';
+ SBUF_SETFLAG(s, SBUF_FINISHED);
+#ifdef _KERNEL
+ return (s->s_error);
+#else
+ if (s->s_error != 0) {
+ errno = s->s_error;
+ return (-1);
+ }
+ return (0);
+#endif
+}
+
+/*
+ * Return a pointer to the sbuf data.
+ */
+char *
+sbuf_data(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, SBUF_FINISHED);
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ return (s->s_buf);
+}
+
+/*
+ * Return the length of the sbuf data.
+ */
+ssize_t
+sbuf_len(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ if (s->s_error != 0)
+ return (-1);
+ return (s->s_len);
+}
+
+/*
+ * Clear an sbuf, free its buffer if necessary.
+ */
+void
+sbuf_delete(struct sbuf *s)
+{
+ int isdyn;
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ isdyn = SBUF_ISDYNSTRUCT(s);
+ memset(s, 0, sizeof(*s));
+ if (isdyn)
+ SBFREE(s);
+}
+
+/*
+ * Check if an sbuf has been finished.
+ */
+int
+sbuf_done(const struct sbuf *s)
+{
+
+ return (SBUF_ISFINISHED(s));
+}
+
+/*
+ * Start a section.
+ */
+void
+sbuf_start_section(struct sbuf *s, ssize_t *old_lenp)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (!SBUF_ISSECTION(s)) {
+ KASSERT(s->s_sect_len == 0,
+ ("s_sect_len != 0 when starting a section"));
+ if (old_lenp != NULL)
+ *old_lenp = -1;
+ SBUF_SETFLAG(s, SBUF_INSECTION);
+ } else {
+ KASSERT(old_lenp != NULL,
+ ("s_sect_len should be saved when starting a subsection"));
+ *old_lenp = s->s_sect_len;
+ s->s_sect_len = 0;
+ }
+}
+
+/*
+ * End the section padding to the specified length with the specified
+ * character.
+ */
+ssize_t
+sbuf_end_section(struct sbuf *s, ssize_t old_len, size_t pad, int c)
+{
+ ssize_t len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(SBUF_ISSECTION(s),
+ ("attempt to end a section when not in a section"));
+
+ if (pad > 1) {
+ len = roundup(s->s_sect_len, pad) - s->s_sect_len;
+ for (; s->s_error == 0 && len > 0; len--)
+ sbuf_put_byte(s, c);
+ }
+ len = s->s_sect_len;
+ if (old_len == -1) {
+ s->s_sect_len = 0;
+ SBUF_CLEARFLAG(s, SBUF_INSECTION);
+ } else {
+ s->s_sect_len += old_len;
+ }
+ if (s->s_error != 0)
+ return (-1);
+ return (len);
+}
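+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of the
+ * section API above; 'name' is a hypothetical string and the section is
+ * NUL-padded to a multiple of 8 bytes.
+ *
+ *	ssize_t old_len, sect_len;
+ *
+ *	sbuf_start_section(s, &old_len);
+ *	sbuf_printf(s, "%s", name);
+ *	sect_len = sbuf_end_section(s, old_len, 8, '\0');
+ */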
diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c
new file mode 100644
index 0000000..824e392
--- /dev/null
+++ b/sys/kern/subr_scanf.c
@@ -0,0 +1,641 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp
+ * From: static char sccsid[] = "@(#)strtol.c 8.1 (Berkeley) 6/4/93";
+ * From: static char sccsid[] = "@(#)strtoul.c 8.1 (Berkeley) 6/4/93";
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ctype.h>
+#include <sys/limits.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define BUF 32 /* Maximum length of numeric string. */
+
+/*
+ * Flags used during conversion.
+ */
+#define LONG 0x01 /* l: long or double */
+#define SHORT 0x04 /* h: short */
+#define SUPPRESS 0x08 /* suppress assignment */
+#define POINTER 0x10 /* weird %p pointer (`fake hex') */
+#define NOSKIP 0x20 /* do not skip blanks */
+#define QUAD 0x400
+#define SHORTSHORT 0x4000 /* hh: char */
+
+/*
+ * The following are used in numeric conversions only:
+ * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ */
+#define SIGNOK 0x40 /* +/- is (still) legal */
+#define NDIGITS 0x80 /* no digits detected */
+
+#define DPTOK 0x100 /* (float) decimal point is still legal */
+#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */
+
+#define PFXOK 0x100 /* 0x prefix is (still) legal */
+#define NZDIGITS 0x200 /* no zero digits detected */
+
+/*
+ * Conversion types.
+ */
+#define CT_CHAR 0 /* %c conversion */
+#define CT_CCL 1 /* %[...] conversion */
+#define CT_STRING 2 /* %s conversion */
+#define CT_INT 3 /* integer, i.e., strtoq or strtouq */
+typedef u_quad_t (*ccfntype)(const char *, char **, int);
+
+static const u_char *__sccl(char *, const u_char *);
+
+int
+sscanf(const char *ibuf, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = vsscanf(ibuf, fmt, ap);
+ va_end(ap);
+ return(ret);
+}
+
+int
+vsscanf(const char *inp, char const *fmt0, va_list ap)
+{
+ int inr;
+ const u_char *fmt = (const u_char *)fmt0;
+ int c; /* character from format, or conversion */
+ size_t width; /* field width, or 0 */
+ char *p; /* points into all kinds of strings */
+ int n; /* handy integer */
+ int flags; /* flags as defined above */
+ char *p0; /* saves original value of p when necessary */
+ int nassigned; /* number of fields assigned */
+ int nconversions; /* number of conversions */
+ int nread; /* number of characters consumed from fp */
+ int base; /* base argument to strtoq/strtouq */
+ ccfntype ccfn; /* conversion function (strtoq/strtouq) */
+ char ccltab[256]; /* character class table for %[...] */
+ char buf[BUF]; /* buffer for numeric conversions */
+
+ /* `basefix' is used to avoid `if' tests in the integer scanner */
+ static short basefix[17] =
+ { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
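+	/*
+	 * For example, basefix[0] == 10: a base that is still undetermined
+	 * (%i with no leading '0' seen yet) is promoted to decimal as soon
+	 * as an ordinary digit is accepted, while basefix[n] == n leaves an
+	 * already known base alone.
+	 */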
+
+ inr = strlen(inp);
+
+ nassigned = 0;
+ nconversions = 0;
+ nread = 0;
+ base = 0; /* XXX just to keep gcc happy */
+ ccfn = NULL; /* XXX just to keep gcc happy */
+ for (;;) {
+ c = *fmt++;
+ if (c == 0)
+ return (nassigned);
+ if (isspace(c)) {
+ while (inr > 0 && isspace(*inp))
+ nread++, inr--, inp++;
+ continue;
+ }
+ if (c != '%')
+ goto literal;
+ width = 0;
+ flags = 0;
+ /*
+ * switch on the format. continue if done;
+ * break once format type is derived.
+ */
+again: c = *fmt++;
+ switch (c) {
+ case '%':
+literal:
+ if (inr <= 0)
+ goto input_failure;
+ if (*inp != c)
+ goto match_failure;
+ inr--, inp++;
+ nread++;
+ continue;
+
+ case '*':
+ flags |= SUPPRESS;
+ goto again;
+ case 'l':
+ if (flags & LONG){
+ flags &= ~LONG;
+ flags |= QUAD;
+ } else {
+ flags |= LONG;
+ }
+ goto again;
+ case 'q':
+ flags |= QUAD;
+ goto again;
+ case 'h':
+ if (flags & SHORT){
+ flags &= ~SHORT;
+ flags |= SHORTSHORT;
+ } else {
+ flags |= SHORT;
+ }
+ goto again;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ width = width * 10 + c - '0';
+ goto again;
+
+ /*
+ * Conversions.
+ *
+ */
+ case 'd':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 10;
+ break;
+
+ case 'i':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 0;
+ break;
+
+ case 'o':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 8;
+ break;
+
+ case 'u':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 10;
+ break;
+
+ case 'x':
+ flags |= PFXOK; /* enable 0x prefixing */
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 's':
+ c = CT_STRING;
+ break;
+
+ case '[':
+ fmt = __sccl(ccltab, fmt);
+ flags |= NOSKIP;
+ c = CT_CCL;
+ break;
+
+ case 'c':
+ flags |= NOSKIP;
+ c = CT_CHAR;
+ break;
+
+ case 'p': /* pointer format is like hex */
+ flags |= POINTER | PFXOK;
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 'n':
+ nconversions++;
+ if (flags & SUPPRESS) /* ??? */
+ continue;
+ if (flags & SHORTSHORT)
+ *va_arg(ap, char *) = nread;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = nread;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = nread;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = nread;
+ else
+ *va_arg(ap, int *) = nread;
+ continue;
+ }
+
+ /*
+ * We have a conversion that requires input.
+ */
+ if (inr <= 0)
+ goto input_failure;
+
+ /*
+ * Consume leading white space, except for formats
+ * that suppress this.
+ */
+ if ((flags & NOSKIP) == 0) {
+ while (isspace(*inp)) {
+ nread++;
+ if (--inr > 0)
+ inp++;
+ else
+ goto input_failure;
+ }
+ /*
+ * Note that there is at least one character in
+ * the buffer, so conversions that do not set NOSKIP
+ * can no longer result in an input failure.
+ */
+ }
+
+ /*
+ * Do the conversion.
+ */
+ switch (c) {
+
+ case CT_CHAR:
+ /* scan arbitrary characters (sets NOSKIP) */
+ if (width == 0)
+ width = 1;
+ if (flags & SUPPRESS) {
+ size_t sum = 0;
+ for (;;) {
+ if ((n = inr) < width) {
+ sum += n;
+ width -= n;
+ inp += n;
+ if (sum == 0)
+ goto input_failure;
+ break;
+ } else {
+ sum += width;
+ inr -= width;
+ inp += width;
+ break;
+ }
+ }
+ nread += sum;
+ } else {
+ bcopy(inp, va_arg(ap, char *), width);
+ inr -= width;
+ inp += width;
+ nread += width;
+ nassigned++;
+ }
+ nconversions++;
+ break;
+
+ case CT_CCL:
+ /* scan a (nonempty) character class (sets NOSKIP) */
+ if (width == 0)
+ width = (size_t)~0; /* `infinity' */
+ /* take only those things in the class */
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (ccltab[(unsigned char)*inp]) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (n == 0)
+ goto input_failure;
+ break;
+ }
+ }
+ if (n == 0)
+ goto match_failure;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (ccltab[(unsigned char)*inp]) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (p == p0)
+ goto input_failure;
+ break;
+ }
+ }
+ n = p - p0;
+ if (n == 0)
+ goto match_failure;
+ *p = 0;
+ nassigned++;
+ }
+ nread += n;
+ nconversions++;
+ break;
+
+ case CT_STRING:
+ /* like CCL, but zero-length string OK, & no NOSKIP */
+ if (width == 0)
+ width = (size_t)~0;
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (!isspace(*inp)) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ nread += n;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (!isspace(*inp)) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ *p = 0;
+ nread += p - p0;
+ nassigned++;
+ }
+ nconversions++;
+ continue;
+
+ case CT_INT:
+ /* scan an integer as if by strtoq/strtouq */
+#ifdef hardway
+ if (width == 0 || width > sizeof(buf) - 1)
+ width = sizeof(buf) - 1;
+#else
+ /* size_t is unsigned, hence this optimisation */
+ if (--width > sizeof(buf) - 2)
+ width = sizeof(buf) - 2;
+ width++;
+#endif
+ flags |= SIGNOK | NDIGITS | NZDIGITS;
+ for (p = buf; width; width--) {
+ c = *inp;
+ /*
+ * Switch on the character; `goto ok'
+ * if we accept it as a part of number.
+ */
+ switch (c) {
+
+ /*
+ * The digit 0 is always legal, but is
+ * special. For %i conversions, if no
+ * digits (zero or nonzero) have been
+ * scanned (only signs), we will have
+ * base==0. In that case, we should set
+ * it to 8 and enable 0x prefixing.
+ * Also, if we have not scanned zero digits
+ * before this, do not turn off prefixing
+ * (someone else will turn it off if we
+ * have scanned any nonzero digits).
+ */
+ case '0':
+ if (base == 0) {
+ base = 8;
+ flags |= PFXOK;
+ }
+ if (flags & NZDIGITS)
+ flags &= ~(SIGNOK|NZDIGITS|NDIGITS);
+ else
+ flags &= ~(SIGNOK|PFXOK|NDIGITS);
+ goto ok;
+
+ /* 1 through 7 always legal */
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ base = basefix[base];
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* digits 8 and 9 ok iff decimal or hex */
+ case '8': case '9':
+ base = basefix[base];
+ if (base <= 8)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* letters ok iff hex */
+ case 'A': case 'B': case 'C':
+ case 'D': case 'E': case 'F':
+ case 'a': case 'b': case 'c':
+ case 'd': case 'e': case 'f':
+ /* no need to fix base here */
+ if (base <= 10)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* sign ok only as first character */
+ case '+': case '-':
+ if (flags & SIGNOK) {
+ flags &= ~SIGNOK;
+ goto ok;
+ }
+ break;
+
+ /* x ok iff flag still set & 2nd char */
+ case 'x': case 'X':
+ if (flags & PFXOK && p == buf + 1) {
+ base = 16; /* if %i */
+ flags &= ~PFXOK;
+ goto ok;
+ }
+ break;
+ }
+
+ /*
+ * If we got here, c is not a legal character
+ * for a number. Stop accumulating digits.
+ */
+ break;
+ ok:
+ /*
+ * c is legal: store it and look at the next.
+ */
+ *p++ = c;
+ if (--inr > 0)
+ inp++;
+ else
+ break; /* end of input */
+ }
+ /*
+ * If we had only a sign, it is no good; push
+ * back the sign. If the number ends in `x',
+ * it was [sign] '0' 'x', so push back the x
+ * and treat it as [sign] '0'.
+ */
+ if (flags & NDIGITS) {
+ if (p > buf) {
+ inp--;
+ inr++;
+ }
+ goto match_failure;
+ }
+ c = ((u_char *)p)[-1];
+ if (c == 'x' || c == 'X') {
+ --p;
+ inp--;
+ inr++;
+ }
+ if ((flags & SUPPRESS) == 0) {
+ u_quad_t res;
+
+ *p = 0;
+ res = (*ccfn)(buf, (char **)NULL, base);
+ if (flags & POINTER)
+ *va_arg(ap, void **) =
+ (void *)(uintptr_t)res;
+ else if (flags & SHORTSHORT)
+ *va_arg(ap, char *) = res;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = res;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = res;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = res;
+ else
+ *va_arg(ap, int *) = res;
+ nassigned++;
+ }
+ nread += p - buf;
+ nconversions++;
+ break;
+
+ }
+ }
+input_failure:
+ return (nconversions != 0 ? nassigned : -1);
+match_failure:
+ return (nassigned);
+}
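+
+/*
+ * Editor's note: an illustrative sketch, not part of this change.  The
+ * kernel sscanf() above supports the integer, string and character-class
+ * conversions (no floating point), e.g.:
+ *
+ *	char name[16];
+ *	int unit;
+ *
+ *	if (sscanf("em0", "%15[a-z]%d", name, &unit) == 2)
+ *		printf("driver %s, unit %d\n", name, unit);
+ */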
+
+/*
+ * Fill in the given table from the scanset at the given format
+ * (just after `['). Return a pointer to the character past the
+ * closing `]'. The table has a 1 wherever characters should be
+ * considered part of the scanset.
+ */
+static const u_char *
+__sccl(char *tab, const u_char *fmt)
+{
+ int c, n, v;
+
+ /* first `clear' the whole table */
+ c = *fmt++; /* first char hat => negated scanset */
+ if (c == '^') {
+ v = 1; /* default => accept */
+ c = *fmt++; /* get new first char */
+ } else
+ v = 0; /* default => reject */
+
+ /* XXX: Will not work if sizeof(tab*) > sizeof(char) */
+ for (n = 0; n < 256; n++)
+ tab[n] = v; /* memset(tab, v, 256) */
+
+ if (c == 0)
+ return (fmt - 1);/* format ended before closing ] */
+
+ /*
+ * Now set the entries corresponding to the actual scanset
+ * to the opposite of the above.
+ *
+ * The first character may be ']' (or '-') without being special;
+ * the last character may be '-'.
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+ * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
+
diff --git a/sys/kern/subr_sglist.c b/sys/kern/subr_sglist.c
new file mode 100644
index 0000000..ea77161
--- /dev/null
+++ b/sys/kern/subr_sglist.c
@@ -0,0 +1,714 @@
+/*-
+ * Copyright (c) 2008 Yahoo!, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/sglist.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/ktr.h>
+
+static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
+
+/*
+ * Convenience macros to save the state of an sglist so it can be restored
+ * if an append attempt fails. Since sglists only grow, we only need to
+ * save the current count of segments and the length of the ending segment.
+ * Earlier segments will not be changed by an append, and the only change
+ * that can occur to the ending segment is that it can be extended.
+ */
+struct sgsave {
+ u_short sg_nseg;
+ size_t ss_len;
+};
+
+#define SGLIST_SAVE(sg, sgsave) do { \
+ (sgsave).sg_nseg = (sg)->sg_nseg; \
+ if ((sgsave).sg_nseg > 0) \
+ (sgsave).ss_len = (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len; \
+ else \
+ (sgsave).ss_len = 0; \
+} while (0)
+
+#define SGLIST_RESTORE(sg, sgsave) do { \
+ (sg)->sg_nseg = (sgsave).sg_nseg; \
+ if ((sgsave).sg_nseg > 0) \
+ (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len = (sgsave).ss_len; \
+} while (0)
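+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of the
+ * pattern the append routines below follow: snapshot the list, attempt the
+ * append, and roll back on failure so a partial append is never visible.
+ *
+ *	struct sgsave save;
+ *
+ *	SGLIST_SAVE(sg, save);
+ *	error = _sglist_append_buf(sg, buf, len, NULL, NULL);
+ *	if (error)
+ *		SGLIST_RESTORE(sg, save);
+ */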
+
+/*
+ * Append a single (paddr, len) to a sglist. sg is the list and ss is
+ * the current segment in the list. If we run out of segments then
+ * EFBIG will be returned.
+ */
+static __inline int
+_sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
+ vm_paddr_t paddr, size_t len)
+{
+ struct sglist_seg *ss;
+
+ ss = *ssp;
+ if (ss->ss_paddr + ss->ss_len == paddr)
+ ss->ss_len += len;
+ else {
+ if (sg->sg_nseg == sg->sg_maxseg)
+ return (EFBIG);
+ ss++;
+ ss->ss_paddr = paddr;
+ ss->ss_len = len;
+ sg->sg_nseg++;
+ *ssp = ss;
+ }
+ return (0);
+}
+
+/*
+ * Worker routine to append a virtual address range (either kernel or
+ * user) to a scatter/gather list.
+ */
+static __inline int
+_sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
+ size_t *donep)
+{
+ struct sglist_seg *ss;
+ vm_offset_t vaddr, offset;
+ vm_paddr_t paddr;
+ size_t seglen;
+ int error;
+
+ if (donep)
+ *donep = 0;
+ if (len == 0)
+ return (0);
+
+ /* Do the first page. It may have an offset. */
+ vaddr = (vm_offset_t)buf;
+ offset = vaddr & PAGE_MASK;
+ if (pmap != NULL)
+ paddr = pmap_extract(pmap, vaddr);
+ else
+ paddr = pmap_kextract(vaddr);
+ seglen = MIN(len, PAGE_SIZE - offset);
+ if (sg->sg_nseg == 0) {
+ ss = sg->sg_segs;
+ ss->ss_paddr = paddr;
+ ss->ss_len = seglen;
+ sg->sg_nseg = 1;
+ } else {
+ ss = &sg->sg_segs[sg->sg_nseg - 1];
+ error = _sglist_append_range(sg, &ss, paddr, seglen);
+ if (error)
+ return (error);
+ }
+ vaddr += seglen;
+ len -= seglen;
+ if (donep)
+ *donep += seglen;
+
+ while (len > 0) {
+ seglen = MIN(len, PAGE_SIZE);
+ if (pmap != NULL)
+ paddr = pmap_extract(pmap, vaddr);
+ else
+ paddr = pmap_kextract(vaddr);
+ error = _sglist_append_range(sg, &ss, paddr, seglen);
+ if (error)
+ return (error);
+ vaddr += seglen;
+ len -= seglen;
+ if (donep)
+ *donep += seglen;
+ }
+
+ return (0);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe a kernel virtual address range.
+ */
+int
+sglist_count(void *buf, size_t len)
+{
+ vm_offset_t vaddr, vendaddr;
+ vm_paddr_t lastaddr, paddr;
+ int nsegs;
+
+ if (len == 0)
+ return (0);
+
+ vaddr = trunc_page((vm_offset_t)buf);
+ vendaddr = (vm_offset_t)buf + len;
+ nsegs = 1;
+ lastaddr = pmap_kextract(vaddr);
+ vaddr += PAGE_SIZE;
+ while (vaddr < vendaddr) {
+ paddr = pmap_kextract(vaddr);
+ if (lastaddr + PAGE_SIZE != paddr)
+ nsegs++;
+ lastaddr = paddr;
+ vaddr += PAGE_SIZE;
+ }
+ return (nsegs);
+}
+
+/*
+ * Allocate a scatter/gather list along with 'nsegs' segments. The
+ * 'mflags' parameter is the same as that passed to malloc(9). The caller
+ * should use sglist_free() to free this list.
+ */
+struct sglist *
+sglist_alloc(int nsegs, int mflags)
+{
+ struct sglist *sg;
+
+ sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
+ M_SGLIST, mflags);
+ if (sg == NULL)
+ return (NULL);
+ sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
+ return (sg);
+}
+
+/*
+ * Free a scatter/gather list allocated via sglist_alloc().
+ */
+void
+sglist_free(struct sglist *sg)
+{
+
+ if (refcount_release(&sg->sg_refs))
+ free(sg, M_SGLIST);
+}
+
+/*
+ * Append the segments to describe a single kernel virtual address
+ * range to a scatter/gather list. If there are insufficient
+ * segments, then this fails with EFBIG.
+ */
+int
+sglist_append(struct sglist *sg, void *buf, size_t len)
+{
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_buf(sg, buf, len, NULL, NULL);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append a single physical address range to a scatter/gather list.
+ * If there are insufficient segments, then this fails with EFBIG.
+ */
+int
+sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
+{
+ struct sglist_seg *ss;
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ if (len == 0)
+ return (0);
+
+ if (sg->sg_nseg == 0) {
+ sg->sg_segs[0].ss_paddr = paddr;
+ sg->sg_segs[0].ss_len = len;
+ sg->sg_nseg = 1;
+ return (0);
+ }
+ ss = &sg->sg_segs[sg->sg_nseg - 1];
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_range(sg, &ss, paddr, len);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append the segments that describe a single mbuf chain to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
+{
+ struct sgsave save;
+ struct mbuf *m;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ error = 0;
+ SGLIST_SAVE(sg, save);
+ for (m = m0; m != NULL; m = m->m_next) {
+ if (m->m_len > 0) {
+ error = sglist_append(sg, m->m_data, m->m_len);
+ if (error) {
+ SGLIST_RESTORE(sg, save);
+ return (error);
+ }
+ }
+ }
+ return (0);
+}
+
+/*
+ * Append the segments that describe a single user address range to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
+{
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_buf(sg, buf, len,
+ vmspace_pmap(td->td_proc->p_vmspace), NULL);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append the segments that describe a single uio to a scatter/gather
+ * list. If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_uio(struct sglist *sg, struct uio *uio)
+{
+ struct iovec *iov;
+ struct sgsave save;
+ size_t resid, minlen;
+ pmap_t pmap;
+ int error, i;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ resid = uio->uio_resid;
+ iov = uio->uio_iov;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("sglist_append_uio: USERSPACE but no thread"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = NULL;
+
+ error = 0;
+ SGLIST_SAVE(sg, save);
+ for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
+ /*
+ * Now at the first iovec to load. Load each iovec
+ * until we have exhausted the residual count.
+ */
+ minlen = MIN(resid, iov[i].iov_len);
+ if (minlen > 0) {
+ error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
+ pmap, NULL);
+ if (error) {
+ SGLIST_RESTORE(sg, save);
+ return (error);
+ }
+ resid -= minlen;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Append the segments that describe at most 'resid' bytes from a
+ * single uio to a scatter/gather list. If there are insufficient
+ * segments, then only the amount that fits is appended.
+ */
+int
+sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid)
+{
+ struct iovec *iov;
+ size_t done;
+ pmap_t pmap;
+ int error, len;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("sglist_consume_uio: USERSPACE but no thread"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = NULL;
+
+ error = 0;
+ while (resid > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ len = iov->iov_len;
+ if (len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (len > resid)
+ len = resid;
+
+ /*
+ * Try to append this iovec. If we run out of room,
+ * then break out of the loop.
+ */
+ error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
+ iov->iov_base = (char *)iov->iov_base + done;
+ iov->iov_len -= done;
+ uio->uio_resid -= done;
+ uio->uio_offset += done;
+ resid -= done;
+ if (error)
+ break;
+ }
+ return (0);
+}
+
+/*
+ * Allocate and populate a scatter/gather list to describe a single
+ * kernel virtual address range.
+ */
+struct sglist *
+sglist_build(void *buf, size_t len, int mflags)
+{
+ struct sglist *sg;
+ int nsegs;
+
+ if (len == 0)
+ return (NULL);
+
+ nsegs = sglist_count(buf, len);
+ sg = sglist_alloc(nsegs, mflags);
+ if (sg == NULL)
+ return (NULL);
+ if (sglist_append(sg, buf, len) != 0) {
+ sglist_free(sg);
+ return (NULL);
+ }
+ return (sg);
+}
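+
+/*
+ * Editor's note: an illustrative sketch, not part of this change; 'buf' and
+ * 'len' are a hypothetical kernel buffer.
+ *
+ *	struct sglist *sg;
+ *
+ *	sg = sglist_build(buf, len, M_WAITOK);
+ *	KASSERT(sglist_length(sg) == len, ("short sglist"));
+ *	... hand sg->sg_segs[0 .. sg->sg_nseg - 1] to the device ...
+ *	sglist_free(sg);
+ */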
+
+/*
+ * Clone a new copy of a scatter/gather list.
+ */
+struct sglist *
+sglist_clone(struct sglist *sg, int mflags)
+{
+ struct sglist *new;
+
+ if (sg == NULL)
+ return (NULL);
+ new = sglist_alloc(sg->sg_maxseg, mflags);
+ if (new == NULL)
+ return (NULL);
+ new->sg_nseg = sg->sg_nseg;
+ bcopy(sg->sg_segs, new->sg_segs, sizeof(struct sglist_seg) *
+ sg->sg_nseg);
+ return (new);
+}
+
+/*
+ * Calculate the total length of the segments described in a
+ * scatter/gather list.
+ */
+size_t
+sglist_length(struct sglist *sg)
+{
+ size_t space;
+ int i;
+
+ space = 0;
+ for (i = 0; i < sg->sg_nseg; i++)
+ space += sg->sg_segs[i].ss_len;
+ return (space);
+}
+
+/*
+ * Split a scatter/gather list into two lists. The scatter/gather
+ * entries for the first 'length' bytes of the 'original' list are
+ * stored in the '*head' list and are removed from 'original'.
+ *
+ * If '*head' is NULL, then a new list will be allocated using
+ * 'mflags'. If M_NOWAIT is specified and the allocation fails,
+ * ENOMEM will be returned.
+ *
+ * If '*head' is not NULL, it should point to an empty sglist. If it
+ * does not have enough room for the remaining space, then EFBIG will
+ * be returned. If '*head' is not empty, then EINVAL will be
+ * returned.
+ *
+ * If 'original' is shared (refcount > 1), then EDOOFUS will be
+ * returned.
+ */
+int
+sglist_split(struct sglist *original, struct sglist **head, size_t length,
+ int mflags)
+{
+ struct sglist *sg;
+ size_t space, split;
+ int count, i;
+
+ if (original->sg_refs > 1)
+ return (EDOOFUS);
+
+ /* Figure out how big of a sglist '*head' has to hold. */
+ count = 0;
+ space = 0;
+ split = 0;
+ for (i = 0; i < original->sg_nseg; i++) {
+ space += original->sg_segs[i].ss_len;
+ count++;
+ if (space >= length) {
+ /*
+ * If 'length' falls in the middle of a
+ * scatter/gather list entry, then 'split'
+ * holds how much of that entry will remain in
+ * 'original'.
+ */
+ split = space - length;
+ break;
+ }
+ }
+
+ /* Nothing to do, so leave head empty. */
+ if (count == 0)
+ return (0);
+
+ if (*head == NULL) {
+ sg = sglist_alloc(count, mflags);
+ if (sg == NULL)
+ return (ENOMEM);
+ *head = sg;
+ } else {
+ sg = *head;
+ if (sg->sg_maxseg < count)
+ return (EFBIG);
+ if (sg->sg_nseg != 0)
+ return (EINVAL);
+ }
+
+ /* Copy 'count' entries to 'sg' from 'original'. */
+ bcopy(original->sg_segs, sg->sg_segs, count *
+ sizeof(struct sglist_seg));
+ sg->sg_nseg = count;
+
+ /*
+ * If we had to split a list entry, fixup the last entry in
+ * 'sg' and the new first entry in 'original'. We also
+ * decrement 'count' by 1 since we will only be removing
+ * 'count - 1' segments from 'original' now.
+ */
+ if (split != 0) {
+ count--;
+ sg->sg_segs[count].ss_len -= split;
+ original->sg_segs[count].ss_paddr =
+ sg->sg_segs[count].ss_paddr + split;
+ original->sg_segs[count].ss_len = split;
+ }
+
+ /* Trim 'count' entries from the front of 'original'. */
+ original->sg_nseg -= count;
+ bcopy(original->sg_segs + count, original->sg_segs, count *
+ sizeof(struct sglist_seg));
+ return (0);
+}
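+
+/*
+ * Editor's note: a worked example, not part of this change.  If 'original'
+ * holds two 4KB segments and 'length' is 6KB, '*head' receives both
+ * segments with the second one trimmed to 2KB, while 'original' is left
+ * with a single 2KB segment covering the tail of its old second segment.
+ */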
+
+/*
+ * Append the scatter/gather list elements in 'second' to the
+ * scatter/gather list 'first'. If there is not enough space in
+ * 'first', EFBIG is returned.
+ */
+int
+sglist_join(struct sglist *first, struct sglist *second)
+{
+ struct sglist_seg *flast, *sfirst;
+ int append;
+
+ /* If 'second' is empty, there is nothing to do. */
+ if (second->sg_nseg == 0)
+ return (0);
+
+ /*
+ * If the first entry in 'second' can be appended to the last entry
+ * in 'first' then set append to '1'.
+ */
+ append = 0;
+ flast = &first->sg_segs[first->sg_nseg - 1];
+ sfirst = &second->sg_segs[0];
+ if (first->sg_nseg != 0 &&
+ flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
+ append = 1;
+
+ /* Make sure 'first' has enough room. */
+ if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
+ return (EFBIG);
+
+ /* Merge last in 'first' and first in 'second' if needed. */
+ if (append)
+ flast->ss_len += sfirst->ss_len;
+
+ /* Append new segments from 'second' to 'first'. */
+	bcopy(second->sg_segs + append, first->sg_segs + first->sg_nseg,
+	    (second->sg_nseg - append) * sizeof(struct sglist_seg));
+ first->sg_nseg += second->sg_nseg - append;
+ sglist_reset(second);
+ return (0);
+}
+
+/*
+ * Generate a new scatter/gather list from a range of an existing
+ * scatter/gather list. The 'offset' and 'length' parameters specify
+ * the logical range of the 'original' list to extract. If that range
+ * is not a subset of the length of 'original', then EINVAL is
+ * returned. The new scatter/gather list is stored in '*slice'.
+ *
+ * If '*slice' is NULL, then a new list will be allocated using
+ * 'mflags'. If M_NOWAIT is specified and the allocation fails,
+ * ENOMEM will be returned.
+ *
+ * If '*slice' is not NULL, it should point to an empty sglist. If it
+ * does not have enough room for the remaining space, then EFBIG will
+ * be returned. If '*slice' is not empty, then EINVAL will be
+ * returned.
+ */
+int
+sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
+ size_t length, int mflags)
+{
+ struct sglist *sg;
+ size_t space, end, foffs, loffs;
+ int count, i, fseg;
+
+ /* Nothing to do. */
+ if (length == 0)
+ return (0);
+
+ /* Figure out how many segments '*slice' needs to have. */
+ end = offset + length;
+ space = 0;
+ count = 0;
+ fseg = 0;
+ foffs = loffs = 0;
+ for (i = 0; i < original->sg_nseg; i++) {
+ space += original->sg_segs[i].ss_len;
+ if (space > offset) {
+ /*
+ * When we hit the first segment, store its index
+ * in 'fseg' and the offset into the first segment
+ * of 'offset' in 'foffs'.
+ */
+ if (count == 0) {
+ fseg = i;
+ foffs = offset - (space -
+ original->sg_segs[i].ss_len);
+ CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
+ foffs);
+ }
+ count++;
+
+ /*
+ * When we hit the last segment, break out of
+ * the loop. Store the amount of extra space
+ * at the end of this segment in 'loffs'.
+ */
+ if (space >= end) {
+ loffs = space - end;
+ CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
+ loffs);
+ break;
+ }
+ }
+ }
+
+ /* If we never hit 'end', then 'length' ran off the end, so fail. */
+ if (space < end)
+ return (EINVAL);
+
+ if (*slice == NULL) {
+ sg = sglist_alloc(count, mflags);
+ if (sg == NULL)
+ return (ENOMEM);
+ *slice = sg;
+ } else {
+ sg = *slice;
+ if (sg->sg_maxseg < count)
+ return (EFBIG);
+ if (sg->sg_nseg != 0)
+ return (EINVAL);
+ }
+
+ /*
+ * Copy over 'count' segments from 'original' starting at
+ * 'fseg' to 'sg'.
+ */
+ bcopy(original->sg_segs + fseg, sg->sg_segs,
+ count * sizeof(struct sglist_seg));
+ sg->sg_nseg = count;
+
+ /* Fixup first and last segments if needed. */
+ if (foffs != 0) {
+ sg->sg_segs[0].ss_paddr += foffs;
+ sg->sg_segs[0].ss_len -= foffs;
+ CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
+ (long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
+ }
+ if (loffs != 0) {
+ sg->sg_segs[count - 1].ss_len -= loffs;
+ CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08x", count - 1,
+ sg->sg_segs[count - 1].ss_len);
+ }
+ return (0);
+}
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
new file mode 100644
index 0000000..92b5147
--- /dev/null
+++ b/sys/kern/subr_sleepqueue.c
@@ -0,0 +1,1236 @@
+/*-
+ * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Implementation of sleep queues used to hold queue of threads blocked on
+ * a wait channel. Sleep queues differ from turnstiles in that wait
+ * channels are not owned by anyone, so there is no priority propagation.
+ * Sleep queues can also provide a timeout and can be interrupted by
+ * signals. That said, there are several similarities between the turnstile
+ * and sleep queue implementations. (Note: turnstiles were implemented
+ * first.) For example, both use a hash table of the same size where each
+ * bucket is referred to as a "chain" that contains both a spin lock and
+ * a linked list of queues. An individual queue is located by using a hash
+ * to pick a chain, locking the chain, and then walking the chain searching
+ * for the queue. This means that a wait channel object does not need to
+ * embed its queue head just as locks do not embed their turnstile queue
+ * head. Threads also carry around a sleep queue that they lend to the
+ * wait channel when blocking. Just as in turnstiles, the queue includes
+ * a free list of the sleep queues of other threads blocked on the same
+ * wait channel in the case of multiple waiters.
+ *
+ * Some additional functionality provided by sleep queues include the
+ * ability to set a timeout. The timeout is managed using a per-thread
+ * callout that resumes a thread if it is asleep. A thread may also
+ * catch signals while it is asleep (aka an interruptible sleep). The
+ * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally,
+ * sleep queues also provide some extra assertions. One is not allowed to
+ * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one
+ * must consistently use the same lock to synchronize with a wait channel,
+ * though this check is currently only a warning for sleep/wakeup due to
+ * pre-existing abuse of that API. The same lock must also be held when
+ * awakening threads, though that is currently only enforced for condition
+ * variables.
+ */
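+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of how a
+ * higher-level primitive typically uses this API.  'lock', 'pri' and the
+ * SLEEPQ_* type flag in 'flags' are assumed to come from the caller and
+ * from sys/sleepqueue.h.
+ *
+ *	Going to sleep (the chain lock is released as part of the switch):
+ *		sleepq_lock(wchan);
+ *		sleepq_add(wchan, lock, "wmesg", flags, 0);
+ *		sleepq_wait(wchan, pri);
+ *
+ *	Waking everyone up:
+ *		sleepq_lock(wchan);
+ *		wakeup_swapper = sleepq_broadcast(wchan, flags, pri, 0);
+ *		sleepq_release(wchan);
+ *		if (wakeup_swapper)
+ *			kick_proc0();
+ */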
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_sleepqueue_profiling.h"
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/sysctl.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * Constants for the hash table of sleep queue chains.
+ * SC_TABLESIZE must be a power of two for SC_MASK to work properly.
+ */
+#define SC_TABLESIZE 256 /* Must be power of 2. */
+#define SC_MASK (SC_TABLESIZE - 1)
+#define SC_SHIFT 8
+#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
+ SC_MASK)
+#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
+#define NR_SLEEPQS 2
+/*
+ * There are two different lists of sleep queues. Both lists are connected
+ * via the sq_hash entries. The first list is the sleep queue chain list
+ * that a sleep queue is on when it is attached to a wait channel. The
+ * second list is the free list hung off of a sleep queue that is attached
+ * to a wait channel.
+ *
+ * Each sleep queue also contains the wait channel it is attached to, the
+ * list of threads blocked on that wait channel, flags specific to the
+ * wait channel, and the lock used to synchronize with a wait channel.
+ * The flags are used to catch mismatches between the various consumers
+ * of the sleep queue API (e.g. sleep/wakeup and condition variables).
+ * The lock pointer is only used when invariants are enabled for various
+ * debugging checks.
+ *
+ * Locking key:
+ * c - sleep queue chain lock
+ */
+struct sleepqueue {
+ TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
+ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */
+ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
+ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
+ void *sq_wchan; /* (c) Wait channel. */
+ int sq_type; /* (c) Queue type. */
+#ifdef INVARIANTS
+ struct lock_object *sq_lock; /* (c) Associated lock. */
+#endif
+};
+
+struct sleepqueue_chain {
+ LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
+ struct mtx sc_lock; /* Spin lock for this chain. */
+#ifdef SLEEPQUEUE_PROFILING
+ u_int sc_depth; /* Length of sc_queues. */
+ u_int sc_max_depth; /* Max length of sc_queues. */
+#endif
+};
+
+#ifdef SLEEPQUEUE_PROFILING
+u_int sleepq_max_depth;
+static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
+static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
+ "sleepq chain stats");
+SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
+    0, "maximum depth achieved of a single chain");
+
+static void sleepq_profile(const char *wmesg);
+static int prof_enabled;
+#endif
+static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
+static uma_zone_t sleepq_zone;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static int sleepq_catch_signals(void *wchan, int pri);
+static int sleepq_check_signals(void);
+static int sleepq_check_timeout(void);
+#ifdef INVARIANTS
+static void sleepq_dtor(void *mem, int size, void *arg);
+#endif
+static int sleepq_init(void *mem, int size, int flags);
+static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
+ int pri);
+static void sleepq_switch(void *wchan, int pri);
+static void sleepq_timeout(void *arg);
+
+SDT_PROBE_DECLARE(sched, , , sleep);
+SDT_PROBE_DECLARE(sched, , , wakeup);
+
+/*
+ * Early initialization of sleep queues that is called from the sleepinit()
+ * SYSINIT.
+ */
+void
+init_sleepqueues(void)
+{
+#ifdef SLEEPQUEUE_PROFILING
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+#endif
+ int i;
+
+ for (i = 0; i < SC_TABLESIZE; i++) {
+ LIST_INIT(&sleepq_chains[i].sc_queues);
+ mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
+ MTX_SPIN | MTX_RECURSE);
+#ifdef SLEEPQUEUE_PROFILING
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
+ NULL);
+#endif
+ }
+ sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
+#ifdef INVARIANTS
+ NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#else
+ NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#endif
+
+ thread0.td_sleepqueue = sleepq_alloc();
+}
+
+/*
+ * Get a sleep queue for a new thread.
+ */
+struct sleepqueue *
+sleepq_alloc(void)
+{
+
+ return (uma_zalloc(sleepq_zone, M_WAITOK));
+}
+
+/*
+ * Free a sleep queue when a thread is destroyed.
+ */
+void
+sleepq_free(struct sleepqueue *sq)
+{
+
+ uma_zfree(sleepq_zone, sq);
+}
+
+/*
+ * Lock the sleep queue chain associated with the specified wait channel.
+ */
+void
+sleepq_lock(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_lock_spin(&sc->sc_lock);
+}
+
+/*
+ * Look up the sleep queue associated with a given wait channel in the hash
+ * table.  The associated sleep queue chain must already be locked by the
+ * caller.  If no queue is found in the table, NULL is returned.
+ */
+struct sleepqueue *
+sleepq_lookup(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ return (sq);
+ return (NULL);
+}
+
+/*
+ * Unlock the sleep queue chain associated with a given wait channel.
+ */
+void
+sleepq_release(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_unlock_spin(&sc->sc_lock);
+}
+
+/*
+ * Places the current thread on the sleep queue for the specified wait
+ * channel. If INVARIANTS is enabled, then it associates the passed in
+ * lock with the sleepq to make sure it is held when that sleep queue is
+ * woken up.
+ */
+void
+sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
+ int queue)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(td->td_sleepqueue != NULL);
+ MPASS(wchan != NULL);
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+
+ /* If this thread is not allowed to sleep, die a horrible death. */
+ KASSERT(td->td_no_sleeping == 0,
+ ("%s: td %p to sleep on wchan %p with sleeping prohibited",
+ __func__, td, wchan));
+
+ /* Look up the sleep queue associated with the wait channel 'wchan'. */
+ sq = sleepq_lookup(wchan);
+
+ /*
+ * If the wait channel does not already have a sleep queue, use
+ * this thread's sleep queue. Otherwise, insert the current thread
+ * into the sleep queue already in use by this wait channel.
+ */
+ if (sq == NULL) {
+#ifdef INVARIANTS
+ int i;
+
+ sq = td->td_sleepqueue;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
+ ("thread's sleep queue %d is not empty", i));
+ KASSERT(sq->sq_blockedcnt[i] == 0,
+ ("thread's sleep queue %d count mismatches", i));
+ }
+ KASSERT(LIST_EMPTY(&sq->sq_free),
+ ("thread's sleep queue has a non-empty free list"));
+ KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
+ sq->sq_lock = lock;
+#endif
+#ifdef SLEEPQUEUE_PROFILING
+ sc->sc_depth++;
+ if (sc->sc_depth > sc->sc_max_depth) {
+ sc->sc_max_depth = sc->sc_depth;
+ if (sc->sc_max_depth > sleepq_max_depth)
+ sleepq_max_depth = sc->sc_max_depth;
+ }
+#endif
+ sq = td->td_sleepqueue;
+ LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
+ sq->sq_wchan = wchan;
+ sq->sq_type = flags & SLEEPQ_TYPE;
+ } else {
+ MPASS(wchan == sq->sq_wchan);
+ MPASS(lock == sq->sq_lock);
+ MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
+ LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
+ }
+ thread_lock(td);
+ TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
+ sq->sq_blockedcnt[queue]++;
+ td->td_sleepqueue = NULL;
+ td->td_sqqueue = queue;
+ td->td_wchan = wchan;
+ td->td_wmesg = wmesg;
+ if (flags & SLEEPQ_INTERRUPTIBLE) {
+ td->td_flags |= TDF_SINTR;
+ td->td_flags &= ~TDF_SLEEPABORT;
+ }
+ thread_unlock(td);
+}
+
+/*
+ * Sets a timeout that will remove the current thread from the specified
+ * sleep queue after timo ticks if the thread has not already been awakened.
+ */
+void
+sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
+ int flags)
+{
+ struct sleepqueue_chain *sc;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(TD_ON_SLEEPQ(td));
+ MPASS(td->td_sleepqueue == NULL);
+ MPASS(wchan != NULL);
+ callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
+ sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
+}
+
+/*
+ * Return the number of actual sleepers for the specified queue.
+ */
+u_int
+sleepq_sleepcnt(void *wchan, int queue)
+{
+ struct sleepqueue *sq;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ return (sq->sq_blockedcnt[queue]);
+}
+
+/*
+ * Marks the pending sleep of the current thread as interruptible and
+ * makes an initial check for pending signals before putting a thread
+ * to sleep. Enters and exits with the thread lock held. Thread lock
+ * may have transitioned from the sleepq lock to a run lock.
+ */
+static int
+sleepq_catch_signals(void *wchan, int pri)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+ struct proc *p;
+ struct sigacts *ps;
+ int sig, ret;
+
+ td = curthread;
+ p = curproc;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(wchan != NULL);
+ if ((td->td_pflags & TDP_WAKEUP) != 0) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ ret = EINTR;
+ thread_lock(td);
+ goto out;
+ }
+
+ /*
+ * See if there are any pending signals for this thread. If not
+ * we can switch immediately. Otherwise do the signal processing
+ * directly.
+ */
+ thread_lock(td);
+ if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) {
+ sleepq_switch(wchan, pri);
+ return (0);
+ }
+ thread_unlock(td);
+ mtx_unlock_spin(&sc->sc_lock);
+ CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
+ (void *)td, (long)p->p_pid, td->td_name);
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ sig = cursig(td);
+ if (sig == 0) {
+ mtx_unlock(&ps->ps_mtx);
+ ret = thread_suspend_check(1);
+ MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
+ } else {
+ if (SIGISMEMBER(ps->ps_sigintr, sig))
+ ret = EINTR;
+ else
+ ret = ERESTART;
+ mtx_unlock(&ps->ps_mtx);
+ }
+ /*
+ * Lock the per-process spinlock prior to dropping the PROC_LOCK
+ * to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and
+ * thread_lock() are currently held in tdsendsignal().
+ */
+ PROC_SLOCK(p);
+ mtx_lock_spin(&sc->sc_lock);
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ PROC_SUNLOCK(p);
+ if (ret == 0) {
+ sleepq_switch(wchan, pri);
+ return (0);
+ }
+out:
+ /*
+ * There were pending signals and this thread is still
+ * on the sleep queue, remove it from the sleep queue.
+ */
+ if (TD_ON_SLEEPQ(td)) {
+ sq = sleepq_lookup(wchan);
+ if (sleepq_resume_thread(sq, td, 0)) {
+#ifdef INVARIANTS
+ /*
+ * This thread hasn't gone to sleep yet, so it
+ * should not be swapped out.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ }
+ mtx_unlock_spin(&sc->sc_lock);
+ MPASS(td->td_lock != &sc->sc_lock);
+ return (ret);
+}
+
+/*
+ * Switches to another thread if we are still asleep on a sleep queue.
+ * Returns with the thread lock held.
+ */
+static void
+sleepq_switch(void *wchan, int pri)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * If we have a sleep queue, then we've already been woken up, so
+ * just return.
+ */
+ if (td->td_sleepqueue != NULL) {
+ mtx_unlock_spin(&sc->sc_lock);
+ return;
+ }
+
+ /*
+ * If TDF_TIMEOUT is set, then our sleep has been timed out
+ * already but we are still on the sleep queue, so dequeue the
+ * thread and return.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_ON_SLEEPQ(td));
+ sq = sleepq_lookup(wchan);
+ if (sleepq_resume_thread(sq, td, 0)) {
+#ifdef INVARIANTS
+ /*
+ * This thread hasn't gone to sleep yet, so it
+ * should not be swapped out.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ mtx_unlock_spin(&sc->sc_lock);
+ return;
+ }
+#ifdef SLEEPQUEUE_PROFILING
+ if (prof_enabled)
+ sleepq_profile(td->td_wmesg);
+#endif
+ MPASS(td->td_sleepqueue == NULL);
+ sched_sleep(td, pri);
+ thread_lock_set(td, &sc->sc_lock);
+ SDT_PROBE0(sched, , , sleep);
+ TD_SET_SLEEPING(td);
+ mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
+ KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
+ CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+}
+
+/*
+ * Check to see if we timed out.
+ */
+static int
+sleepq_check_timeout(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * If TDF_TIMEOUT is set, we timed out.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ return (EWOULDBLOCK);
+ }
+
+ /*
+ * If TDF_TIMOFAIL is set, the timeout ran after we had
+ * already been woken up.
+ */
+ if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+
+ /*
+ * If callout_stop() fails, then the timeout is running on
+ * another CPU, so synchronize with it to avoid having it
+ * accidentally wake up a subsequent sleep.
+ */
+ else if (callout_stop(&td->td_slpcallout) == 0) {
+ td->td_flags |= TDF_TIMEOUT;
+ TD_SET_SLEEPING(td);
+ mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
+ }
+ return (0);
+}
+
+/*
+ * Check to see if we were awoken by a signal.
+ */
+static int
+sleepq_check_signals(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /* We are no longer in an interruptible sleep. */
+ if (td->td_flags & TDF_SINTR)
+ td->td_flags &= ~TDF_SINTR;
+
+ if (td->td_flags & TDF_SLEEPABORT) {
+ td->td_flags &= ~TDF_SLEEPABORT;
+ return (td->td_intrval);
+ }
+
+ return (0);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue.
+ */
+void
+sleepq_wait(void *wchan, int pri)
+{
+ struct thread *td;
+
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
+ sleepq_switch(wchan, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue
+ * or it is interrupted by a signal.
+ */
+int
+sleepq_wait_sig(void *wchan, int pri)
+{
+ int rcatch;
+ int rval;
+
+ rcatch = sleepq_catch_signals(wchan, pri);
+ rval = sleepq_check_signals();
+ thread_unlock(curthread);
+ if (rcatch)
+ return (rcatch);
+ return (rval);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue
+ * or it times out while waiting.
+ */
+int
+sleepq_timedwait(void *wchan, int pri)
+{
+ struct thread *td;
+ int rval;
+
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
+ sleepq_switch(wchan, pri);
+ rval = sleepq_check_timeout();
+ thread_unlock(td);
+
+ return (rval);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue,
+ * it is interrupted by a signal, or it times out waiting to be awakened.
+ */
+int
+sleepq_timedwait_sig(void *wchan, int pri)
+{
+ int rcatch, rvalt, rvals;
+
+ rcatch = sleepq_catch_signals(wchan, pri);
+ rvalt = sleepq_check_timeout();
+ rvals = sleepq_check_signals();
+ thread_unlock(curthread);
+ if (rcatch)
+ return (rcatch);
+ if (rvals)
+ return (rvals);
+ return (rvalt);
+}
+
+/*
+ * Returns the type of sleep queue given a wait channel.
+ */
+int
+sleepq_type(void *wchan)
+{
+ struct sleepqueue *sq;
+ int type;
+
+ MPASS(wchan != NULL);
+
+ sleepq_lock(wchan);
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL) {
+ sleepq_release(wchan);
+ return (-1);
+ }
+ type = sq->sq_type;
+ sleepq_release(wchan);
+ return (type);
+}
+
+/*
+ * Removes a thread from a sleep queue and makes it
+ * runnable.
+ */
+static int
+sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
+{
+ struct sleepqueue_chain *sc;
+
+ MPASS(td != NULL);
+ MPASS(sq->sq_wchan != NULL);
+ MPASS(td->td_wchan == sq->sq_wchan);
+ MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sc = SC_LOOKUP(sq->sq_wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+
+ SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+
+ /* Remove the thread from the queue. */
+ sq->sq_blockedcnt[td->td_sqqueue]--;
+ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
+
+ /*
+ * Get a sleep queue for this thread. If this is the last waiter,
+ * use the queue itself and take it out of the chain, otherwise,
+ * remove a queue from the free list.
+ */
+ if (LIST_EMPTY(&sq->sq_free)) {
+ td->td_sleepqueue = sq;
+#ifdef INVARIANTS
+ sq->sq_wchan = NULL;
+#endif
+#ifdef SLEEPQUEUE_PROFILING
+ sc->sc_depth--;
+#endif
+ } else
+ td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
+ LIST_REMOVE(td->td_sleepqueue, sq_hash);
+
+ td->td_wmesg = NULL;
+ td->td_wchan = NULL;
+ td->td_flags &= ~TDF_SINTR;
+
+ CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, td->td_name);
+
+ /* Adjust priority if requested. */
+ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
+ if (pri != 0 && td->td_priority > pri &&
+ PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, pri);
+
+ /*
+ * Note that thread td might not be sleeping if it is running
+ * sleepq_catch_signals() on another CPU or is blocked on its
+ * proc lock to check signals. There's no need to mark the
+ * thread runnable in that case.
+ */
+ if (TD_IS_SLEEPING(td)) {
+ TD_CLR_SLEEPING(td);
+ return (setrunnable(td));
+ }
+ return (0);
+}
+
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+sleepq_dtor(void *mem, int size, void *arg)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
+ MPASS(sq->sq_blockedcnt[i] == 0);
+ }
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+sleepq_init(void *mem, int size, int flags)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ bzero(mem, size);
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ TAILQ_INIT(&sq->sq_blocked[i]);
+ sq->sq_blockedcnt[i] = 0;
+ }
+ LIST_INIT(&sq->sq_free);
+ return (0);
+}
+
+/*
+ * Find the highest priority thread sleeping on a wait channel and resume it.
+ */
+int
+sleepq_signal(void *wchan, int flags, int pri, int queue)
+{
+ struct sleepqueue *sq;
+ struct thread *td, *besttd;
+ int wakeup_swapper;
+
+ CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
+ ("%s: mismatch between sleep/wakeup and cv_*", __func__));
+
+ /*
+ * Find the highest priority thread on the queue. If there is a
+ * tie, use the thread that first appears in the queue as it has
+ * been sleeping the longest since threads are always added to
+ * the tail of sleep queues.
+ */
+ besttd = NULL;
+ TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
+ if (besttd == NULL || td->td_priority < besttd->td_priority)
+ besttd = td;
+ }
+ MPASS(besttd != NULL);
+ thread_lock(besttd);
+ wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
+ thread_unlock(besttd);
+ return (wakeup_swapper);
+}
+
+/*
+ * Resume all threads sleeping on a specified wait channel.
+ */
+int
+sleepq_broadcast(void *wchan, int flags, int pri, int queue)
+{
+ struct sleepqueue *sq;
+ struct thread *td, *tdn;
+ int wakeup_swapper;
+
+ CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
+ ("%s: mismatch between sleep/wakeup and cv_*", __func__));
+
+ /* Resume all blocked threads on the sleep queue. */
+ wakeup_swapper = 0;
+ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
+ thread_lock(td);
+ if (sleepq_resume_thread(sq, td, pri))
+ wakeup_swapper = 1;
+ thread_unlock(td);
+ }
+ return (wakeup_swapper);
+}
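+
+/*
+ * Illustrative sketch (not part of the original file): a typical
+ * wakeup-side consumer of this API locks the wait channel, signals or
+ * broadcasts, releases the chain lock and only then kicks the swapper,
+ * roughly:
+ *
+ *	static void
+ *	example_wakeup(void *ident)
+ *	{
+ *		int wakeup_swapper;
+ *
+ *		sleepq_lock(ident);
+ *		wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
+ *		sleepq_release(ident);
+ *		if (wakeup_swapper)
+ *			kick_proc0();
+ *	}
+ *
+ * The function name above is hypothetical; SLEEPQ_SLEEP and queue 0 are
+ * the values normally used by the sleep(9)-style wrappers.
+ */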
+
+/*
+ * Time sleeping threads out. When the timeout expires, the thread is
+ * removed from the sleep queue and made runnable if it is still asleep.
+ */
+static void
+sleepq_timeout(void *arg)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+ void *wchan;
+ int wakeup_swapper;
+
+ td = arg;
+ wakeup_swapper = 0;
+ CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+
+ /*
+ * First, see if the thread is asleep and get the wait channel if
+ * it is.
+ */
+ thread_lock(td);
+ if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+ wchan = td->td_wchan;
+ sc = SC_LOOKUP(wchan);
+ THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+ td->td_flags |= TDF_TIMEOUT;
+ wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+ return;
+ }
+
+ /*
+ * If the thread is on the SLEEPQ but isn't sleeping yet, it
+ * can either be on another CPU in between sleepq_add() and
+ * one of the sleepq_*wait*() routines or it can be in
+ * sleepq_catch_signals().
+ */
+ if (TD_ON_SLEEPQ(td)) {
+ td->td_flags |= TDF_TIMEOUT;
+ thread_unlock(td);
+ return;
+ }
+
+ /*
+ * Now check for the edge cases. First, if TDF_TIMEOUT is set,
+ * then the other thread has already yielded to us, so clear
+ * the flag and resume it. If TDF_TIMEOUT is not set, then we
+ * know that the other thread is not on a sleep queue, but it
+ * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL
+ * to let it know that the timeout has already run and doesn't
+ * need to be canceled.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_IS_SLEEPING(td));
+ td->td_flags &= ~TDF_TIMEOUT;
+ TD_CLR_SLEEPING(td);
+ wakeup_swapper = setrunnable(td);
+ } else
+ td->td_flags |= TDF_TIMOFAIL;
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Resumes a specific thread from the sleep queue associated with a specific
+ * wait channel if it is on that queue.
+ */
+void
+sleepq_remove(struct thread *td, void *wchan)
+{
+ struct sleepqueue *sq;
+ int wakeup_swapper;
+
+ /*
+ * Look up the sleep queue for this wait channel, then re-check
+ * that the thread is asleep on that channel; if it is not,
+ * bail.
+ */
+ MPASS(wchan != NULL);
+ sleepq_lock(wchan);
+ sq = sleepq_lookup(wchan);
+ /*
+ * We cannot lock the thread here, as it may be sleeping on a
+ * different sleepq. However, holding the sleepq lock for this
+ * wchan can guarantee that we do not miss a wakeup for this
+ * channel. The asserts below will catch any false positives.
+ */
+ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
+ sleepq_release(wchan);
+ return;
+ }
+ /* Thread is asleep on sleep queue sq, so wake it up. */
+ thread_lock(td);
+ MPASS(sq != NULL);
+ MPASS(td->td_wchan == wchan);
+ wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+ thread_unlock(td);
+ sleepq_release(wchan);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Abort a thread as if an interrupt had occurred. Only abort
+ * interruptible waits (unfortunately it isn't safe to abort others).
+ */
+int
+sleepq_abort(struct thread *td, int intrval)
+{
+ struct sleepqueue *sq;
+ void *wchan;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ MPASS(TD_ON_SLEEPQ(td));
+ MPASS(td->td_flags & TDF_SINTR);
+ MPASS(intrval == EINTR || intrval == ERESTART);
+
+ /*
+ * If the TDF_TIMEOUT flag is set, just leave; a timeout
+ * is already scheduled for this thread anyhow.
+ */
+ if (td->td_flags & TDF_TIMEOUT)
+ return (0);
+
+ CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+ td->td_intrval = intrval;
+ td->td_flags |= TDF_SLEEPABORT;
+ /*
+ * If the thread has not slept yet, it will find the signal in
+ * sleepq_catch_signals() and call sleepq_resume_thread(). Otherwise,
+ * we have to do it here.
+ */
+ if (!TD_IS_SLEEPING(td))
+ return (0);
+ wchan = td->td_wchan;
+ MPASS(wchan != NULL);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+
+ /* Thread is asleep on sleep queue sq, so wake it up. */
+ return (sleepq_resume_thread(sq, td, 0));
+}
+
+#ifdef SLEEPQUEUE_PROFILING
+#define SLEEPQ_PROF_LOCATIONS 1024
+#define SLEEPQ_SBUFSIZE 512
+struct sleepq_prof {
+ LIST_ENTRY(sleepq_prof) sp_link;
+ const char *sp_wmesg;
+ long sp_count;
+};
+
+LIST_HEAD(sqphead, sleepq_prof);
+
+struct sqphead sleepq_prof_free;
+struct sqphead sleepq_hash[SC_TABLESIZE];
+static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
+static struct mtx sleepq_prof_lock;
+MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
+
+static void
+sleepq_profile(const char *wmesg)
+{
+ struct sleepq_prof *sp;
+
+ mtx_lock_spin(&sleepq_prof_lock);
+ if (prof_enabled == 0)
+ goto unlock;
+ LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
+ if (sp->sp_wmesg == wmesg)
+ goto done;
+ sp = LIST_FIRST(&sleepq_prof_free);
+ if (sp == NULL)
+ goto unlock;
+ sp->sp_wmesg = wmesg;
+ LIST_REMOVE(sp, sp_link);
+ LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
+done:
+ sp->sp_count++;
+unlock:
+ mtx_unlock_spin(&sleepq_prof_lock);
+ return;
+}
+
+static void
+sleepq_prof_reset(void)
+{
+ struct sleepq_prof *sp;
+ int enabled;
+ int i;
+
+ mtx_lock_spin(&sleepq_prof_lock);
+ enabled = prof_enabled;
+ prof_enabled = 0;
+ for (i = 0; i < SC_TABLESIZE; i++)
+ LIST_INIT(&sleepq_hash[i]);
+ LIST_INIT(&sleepq_prof_free);
+ for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
+ sp = &sleepq_profent[i];
+ sp->sp_wmesg = NULL;
+ sp->sp_count = 0;
+ LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
+ }
+ prof_enabled = enabled;
+ mtx_unlock_spin(&sleepq_prof_lock);
+}
+
+static int
+enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = prof_enabled;
+ error = sysctl_handle_int(oidp, &v, v, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == prof_enabled)
+ return (0);
+ if (v == 1)
+ sleepq_prof_reset();
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = !!v;
+ mtx_unlock_spin(&sleepq_prof_lock);
+
+ return (0);
+}
+
+static int
+reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+ sleepq_prof_reset();
+
+ return (0);
+}
+
+static int
+dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sleepq_prof *sp;
+ struct sbuf *sb;
+ int enabled;
+ int error;
+ int i;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
+ sbuf_printf(sb, "\nwmesg\tcount\n");
+ enabled = prof_enabled;
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = 0;
+ mtx_unlock_spin(&sleepq_prof_lock);
+ for (i = 0; i < SC_TABLESIZE; i++) {
+ LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
+ sbuf_printf(sb, "%s\t%ld\n",
+ sp->sp_wmesg, sp->sp_count);
+ }
+ }
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = enabled;
+ mtx_unlock_spin(&sleepq_prof_lock);
+
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+ return (error);
+}
+
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, reset_sleepq_prof_stats, "I",
+ "Reset sleepqueue profiling statistics");
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
+#endif
+
+#ifdef DDB
+DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+#ifdef INVARIANTS
+ struct lock_object *lock;
+#endif
+ struct thread *td;
+ void *wchan;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active sleep queue for the wait channel
+ * indicated by the address.
+ */
+ wchan = (void *)addr;
+ sc = SC_LOOKUP(wchan);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ goto found;
+
+ /*
+ * Second, see if there is an active sleep queue at the address
+ * indicated.
+ */
+ for (i = 0; i < SC_TABLESIZE; i++)
+ LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
+ if (sq == (struct sleepqueue *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
+ return;
+found:
+ db_printf("Wait channel: %p\n", sq->sq_wchan);
+ db_printf("Queue type: %d\n", sq->sq_type);
+#ifdef INVARIANTS
+ if (sq->sq_lock) {
+ lock = sq->sq_lock;
+ db_printf("Associated Interlock: %p - (%s) %s\n", lock,
+ LOCK_CLASS(lock)->lc_name, lock->lo_name);
+ }
+#endif
+ db_printf("Blocked threads:\n");
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ db_printf("\nQueue[%d]:\n", i);
+ if (TAILQ_EMPTY(&sq->sq_blocked[i]))
+ db_printf("\tempty\n");
+ else
+ TAILQ_FOREACH(td, &sq->sq_blocked[i],
+ td_slpq) {
+ db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_name);
+ }
+ db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
+ }
+}
+
+/* Alias 'show sleepqueue' to 'show sleepq'. */
+DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
+#endif
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..3614798
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,787 @@
+/*-
+ * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and machine independent functions
+ * used for the kernel SMP support.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#include "opt_sched.h"
+
+#ifdef SMP
+volatile cpuset_t stopped_cpus;
+volatile cpuset_t started_cpus;
+volatile cpuset_t suspended_cpus;
+cpuset_t hlt_cpus_mask;
+cpuset_t logical_cpus_mask;
+
+void (*cpustop_restartfunc)(void);
+#endif
+/* This is used in modules that need to work in both SMP and UP. */
+cpuset_t all_cpus;
+
+int mp_ncpus;
+/* export this for libkvm consumers. */
+int mp_maxcpus = MAXCPU;
+
+volatile int smp_started;
+u_int mp_maxid;
+
+static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
+ "Kernel SMP");
+
+SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
+ "Max CPU ID.");
+
+SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
+ 0, "Max number of CPUs that the system was compiled for.");
+
+int smp_active = 0; /* are the APs allowed to run? */
+SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0,
+ "Number of Auxiliary Processors (APs) that were successfully started");
+
+int smp_disabled = 0; /* has smp been disabled? */
+SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
+ &smp_disabled, 0, "SMP has been disabled from the loader");
+TUNABLE_INT("kern.smp.disabled", &smp_disabled);
+
+int smp_cpus = 1; /* how many CPUs are running */
+SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
+ "Number of CPUs online");
+
+int smp_topology = 0; /* Which topology we're using. */
+SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0,
+ "Topology override setting; 0 is default provided by hardware.");
+TUNABLE_INT("kern.smp.topology", &smp_topology);
+
+#ifdef SMP
+/* Enable forwarding of a signal to a process running on a different CPU */
+static int forward_signal_enabled = 1;
+SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
+ &forward_signal_enabled, 0,
+ "Forwarding of a signal to a process on a different CPU");
+
+/* Variables needed for SMP rendezvous. */
+static volatile int smp_rv_ncpus;
+static void (*volatile smp_rv_setup_func)(void *arg);
+static void (*volatile smp_rv_action_func)(void *arg);
+static void (*volatile smp_rv_teardown_func)(void *arg);
+static void *volatile smp_rv_func_arg;
+static volatile int smp_rv_waiters[4];
+
+/*
+ * Shared mutex to restrict busywaits between smp_rendezvous() and
+ * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these
+ * functions trigger at once and cause multiple CPUs to busywait with
+ * interrupts disabled.
+ */
+struct mtx smp_ipi_mtx;
+
+/*
+ * Let the MD SMP code initialize mp_maxid very early if it can.
+ */
+static void
+mp_setmaxid(void *dummy)
+{
+ cpu_mp_setmaxid();
+}
+SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
+
+/*
+ * Call the MD SMP initialization code.
+ */
+static void
+mp_start(void *dummy)
+{
+
+ mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
+
+ /* Probe for MP hardware. */
+ if (smp_disabled != 0 || cpu_mp_probe() == 0) {
+ mp_ncpus = 1;
+ CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
+ return;
+ }
+
+ cpu_mp_start();
+ printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
+ mp_ncpus);
+ cpu_mp_announce();
+}
+SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
+
+void
+forward_signal(struct thread *td)
+{
+ int id;
+
+ /*
+ * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
+ * this thread, so all we need to do is poke it if it is currently
+ * executing so that it executes ast().
+ */
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(TD_IS_RUNNING(td),
+ ("forward_signal: thread is not TDS_RUNNING"));
+
+ CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
+
+ if (!smp_started || cold || panicstr)
+ return;
+ if (!forward_signal_enabled)
+ return;
+
+ /* No need to IPI ourself. */
+ if (td == curthread)
+ return;
+
+ id = td->td_oncpu;
+ if (id == NOCPU)
+ return;
+ ipi_cpu(id, IPI_AST);
+}
+
+/*
+ * When called, the executing CPU will send an IPI to all other CPUs
+ * requesting that they halt execution.
+ *
+ * Usually (but not necessarily) called with 'other_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to stop.
+ * - Waits for each to stop.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ *
+ */
+static int
+generic_stop_cpus(cpuset_t map, u_int type)
+{
+#ifdef KTR
+ char cpusetbuf[CPUSETBUFSIZ];
+#endif
+ static volatile u_int stopping_cpu = NOCPU;
+ int i;
+ volatile cpuset_t *cpus;
+
+ KASSERT(
+#if defined(__amd64__) || defined(__i386__)
+ type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
+#else
+ type == IPI_STOP || type == IPI_STOP_HARD,
+#endif
+ ("%s: invalid stop type", __func__));
+
+ if (!smp_started)
+ return (0);
+
+ CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
+ cpusetobj_strprint(cpusetbuf, &map), type);
+
+ if (stopping_cpu != PCPU_GET(cpuid))
+ while (atomic_cmpset_int(&stopping_cpu, NOCPU,
+ PCPU_GET(cpuid)) == 0)
+ while (stopping_cpu != NOCPU)
+ cpu_spinwait(); /* spin */
+
+ /* send the stop IPI to all CPUs in map */
+ ipi_selected(map, type);
+
+#if defined(__amd64__) || defined(__i386__)
+ if (type == IPI_SUSPEND)
+ cpus = &suspended_cpus;
+ else
+#endif
+ cpus = &stopped_cpus;
+
+ i = 0;
+ while (!CPU_SUBSET(cpus, &map)) {
+ /* spin */
+ cpu_spinwait();
+ i++;
+ if (i == 100000000) {
+ printf("timeout stopping cpus\n");
+ break;
+ }
+ }
+
+ stopping_cpu = NOCPU;
+ return (1);
+}
+
+int
+stop_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_STOP));
+}
+
+int
+stop_cpus_hard(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_STOP_HARD));
+}
+
+#if defined(__amd64__) || defined(__i386__)
+int
+suspend_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_SUSPEND));
+}
+#endif
+
+/*
+ * Called by a CPU to restart stopped CPUs.
+ *
+ * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to restart.
+ * - Waits for each to restart.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ */
+int
+restart_cpus(cpuset_t map)
+{
+#ifdef KTR
+ char cpusetbuf[CPUSETBUFSIZ];
+#endif
+
+ if (!smp_started)
+ return (0);
+
+ CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
+
+ /* signal other cpus to restart */
+ CPU_COPY_STORE_REL(&map, &started_cpus);
+
+ /* wait for each to clear its bit */
+ while (CPU_OVERLAP(&stopped_cpus, &map))
+ cpu_spinwait();
+
+ return (1);
+}
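+
+/*
+ * Illustrative sketch (not part of the original file): a caller that
+ * wants to halt every other CPU, inspect or modify shared state, and
+ * then let them continue might do something like the following
+ * (simplified, error handling omitted):
+ *
+ *	cpuset_t map;
+ *
+ *	map = all_cpus;
+ *	CPU_CLR(PCPU_GET(cpuid), &map);
+ *	if (stop_cpus(map) != 0) {
+ *		... examine or patch shared state ...
+ *		restart_cpus(stopped_cpus);
+ *	}
+ *
+ * As noted above, stopped_cpus records the CPUs actually stopped and is
+ * the usual argument to restart_cpus().
+ */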
+
+/*
+ * All-CPU rendezvous. CPUs are signalled, all execute the setup function
+ * (if specified), rendezvous, execute the action function (if specified),
+ * rendezvous again, execute the teardown function (if specified), and then
+ * resume.
+ *
+ * Note that the supplied external functions _must_ be reentrant and aware
+ * that they are running in parallel and in an unknown lock context.
+ */
+void
+smp_rendezvous_action(void)
+{
+ struct thread *td;
+ void *local_func_arg;
+ void (*local_setup_func)(void*);
+ void (*local_action_func)(void*);
+ void (*local_teardown_func)(void*);
+#ifdef INVARIANTS
+ int owepreempt;
+#endif
+
+ /* Ensure we have up-to-date values. */
+ atomic_add_acq_int(&smp_rv_waiters[0], 1);
+ while (smp_rv_waiters[0] < smp_rv_ncpus)
+ cpu_spinwait();
+
+ /* Fetch rendezvous parameters after acquire barrier. */
+ local_func_arg = smp_rv_func_arg;
+ local_setup_func = smp_rv_setup_func;
+ local_action_func = smp_rv_action_func;
+ local_teardown_func = smp_rv_teardown_func;
+
+ /*
+ * Use a nested critical section to prevent any preemptions
+ * from occurring during a rendezvous action routine.
+ * Specifically, if a rendezvous handler is invoked via an IPI
+ * and the interrupted thread was in the critical_exit()
+ * function after setting td_critnest to 0 but before
+ * performing a deferred preemption, this routine can be
+ * invoked with td_critnest set to 0 and td_owepreempt true.
+ * In that case, a critical_exit() during the rendezvous
+ * action would trigger a preemption which is not permitted in
+ * a rendezvous action. To fix this, wrap all of the
+ * rendezvous action handlers in a critical section. We
+ * cannot use a regular critical section however as having
+ * critical_exit() preempt from this routine would also be
+ * problematic (the preemption must not occur before the IPI
+ * has been acknowledged via an EOI). Instead, we
+ * intentionally ignore td_owepreempt when leaving the
+ * critical section. This should be harmless because we do
+ * not permit rendezvous action routines to schedule threads,
+ * and thus td_owepreempt should never transition from 0 to 1
+ * during this routine.
+ */
+ td = curthread;
+ td->td_critnest++;
+#ifdef INVARIANTS
+ owepreempt = td->td_owepreempt;
+#endif
+
+ /*
+ * If requested, run a setup function before the main action
+ * function. Ensure all CPUs have completed the setup
+ * function before moving on to the action function.
+ */
+ if (local_setup_func != smp_no_rendevous_barrier) {
+ if (local_setup_func != NULL)
+ local_setup_func(local_func_arg);
+ atomic_add_int(&smp_rv_waiters[1], 1);
+ while (smp_rv_waiters[1] < smp_rv_ncpus)
+ cpu_spinwait();
+ }
+
+ if (local_action_func != NULL)
+ local_action_func(local_func_arg);
+
+ if (local_teardown_func != smp_no_rendevous_barrier) {
+ /*
+ * Signal that the main action has been completed. If a
+ * full exit rendezvous is requested, then all CPUs will
+ * wait here until all CPUs have finished the main action.
+ */
+ atomic_add_int(&smp_rv_waiters[2], 1);
+ while (smp_rv_waiters[2] < smp_rv_ncpus)
+ cpu_spinwait();
+
+ if (local_teardown_func != NULL)
+ local_teardown_func(local_func_arg);
+ }
+
+ /*
+ * Signal that the rendezvous is fully completed by this CPU.
+ * This means that no member of the smp_rv_* pseudo-structure will be
+ * accessed by this target CPU after this point; in particular, the
+ * memory pointed to by smp_rv_func_arg.
+ */
+ atomic_add_int(&smp_rv_waiters[3], 1);
+
+ td->td_critnest--;
+ KASSERT(owepreempt == td->td_owepreempt,
+ ("rendezvous action changed td_owepreempt"));
+}
+
+void
+smp_rendezvous_cpus(cpuset_t map,
+ void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+ int curcpumap, i, ncpus = 0;
+
+ /* See the comments in the !SMP case. */
+ if (!smp_started) {
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+ return;
+ }
+
+ CPU_FOREACH(i) {
+ if (CPU_ISSET(i, &map))
+ ncpus++;
+ }
+ if (ncpus == 0)
+ panic("ncpus is 0 with non-zero map");
+
+ mtx_lock_spin(&smp_ipi_mtx);
+
+ /* Pass rendezvous parameters via global variables. */
+ smp_rv_ncpus = ncpus;
+ smp_rv_setup_func = setup_func;
+ smp_rv_action_func = action_func;
+ smp_rv_teardown_func = teardown_func;
+ smp_rv_func_arg = arg;
+ smp_rv_waiters[1] = 0;
+ smp_rv_waiters[2] = 0;
+ smp_rv_waiters[3] = 0;
+ atomic_store_rel_int(&smp_rv_waiters[0], 0);
+
+ /*
+ * Signal other processors, which will enter the IPI with
+ * interrupts off.
+ */
+ curcpumap = CPU_ISSET(curcpu, &map);
+ CPU_CLR(curcpu, &map);
+ ipi_selected(map, IPI_RENDEZVOUS);
+
+ /* Check if the current CPU is in the map */
+ if (curcpumap != 0)
+ smp_rendezvous_action();
+
+ /*
+ * Ensure that the master CPU waits for all the other
+ * CPUs to finish the rendezvous, so that the smp_rv_*
+ * pseudo-structure and the arg are guaranteed not to
+ * be in use.
+ */
+ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
+ cpu_spinwait();
+
+ mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+void
+smp_rendezvous(void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+ smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
+}
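+
+/*
+ * Illustrative sketch (not part of the original file): running a short
+ * action on every CPU with interrupts disabled might look roughly like
+ * this, with example_action being a hypothetical handler:
+ *
+ *	static void
+ *	example_action(void *arg)
+ *	{
+ *
+ *		... per-CPU work; interrupts are disabled here ...
+ *	}
+ *
+ *	smp_rendezvous(smp_no_rendevous_barrier, example_action,
+ *	    smp_no_rendevous_barrier, NULL);
+ *
+ * Passing smp_no_rendevous_barrier for the setup and teardown hooks skips
+ * the corresponding barriers, as handled in smp_rendezvous_action() above.
+ */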
+
+static struct cpu_group group[MAXCPU];
+
+struct cpu_group *
+smp_topo(void)
+{
+ char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
+ struct cpu_group *top;
+
+ /*
+ * Check for a fake topology request for debugging purposes.
+ */
+ switch (smp_topology) {
+ case 1:
+ /* Dual core with no sharing. */
+ top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
+ break;
+ case 2:
+ /* No topology, all cpus are equal. */
+ top = smp_topo_none();
+ break;
+ case 3:
+ /* Dual core with shared L2. */
+ top = smp_topo_1level(CG_SHARE_L2, 2, 0);
+ break;
+ case 4:
+ /* quad core, shared l3 among each package, private l2. */
+ top = smp_topo_1level(CG_SHARE_L3, 4, 0);
+ break;
+ case 5:
+ /* quad core, 2 dualcore parts on each package share l2. */
+ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
+ break;
+ case 6:
+ /* Single-core 2xHTT */
+ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
+ break;
+ case 7:
+ /* quad core with a shared l3, 8 threads sharing L2. */
+ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
+ CG_FLAG_SMT);
+ break;
+ default:
+ /* Default, ask the system what it wants. */
+ top = cpu_topo();
+ break;
+ }
+ /*
+ * Verify the returned topology.
+ */
+ if (top->cg_count != mp_ncpus)
+ panic("Built bad topology at %p. CPU count %d != %d",
+ top, top->cg_count, mp_ncpus);
+ if (CPU_CMP(&top->cg_mask, &all_cpus))
+ panic("Built bad topology at %p. CPU mask (%s) != (%s)",
+ top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
+ cpusetobj_strprint(cpusetbuf2, &all_cpus));
+ return (top);
+}
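+
+/*
+ * Illustrative note (not from the original source): the fake topologies
+ * above are selected with the kern.smp.topology loader tunable, e.g. a
+ * loader.conf line such as
+ *
+ *	kern.smp.topology=5
+ *
+ * would exercise the two-level case (two L2 groups of two cores per
+ * package) built by smp_topo_2level() below; the default of 0 asks the
+ * MD code via cpu_topo().
+ */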
+
+struct cpu_group *
+smp_topo_none(void)
+{
+ struct cpu_group *top;
+
+ top = &group[0];
+ top->cg_parent = NULL;
+ top->cg_child = NULL;
+ top->cg_mask = all_cpus;
+ top->cg_count = mp_ncpus;
+ top->cg_children = 0;
+ top->cg_level = CG_SHARE_NONE;
+ top->cg_flags = 0;
+
+ return (top);
+}
+
+static int
+smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
+ int count, int flags, int start)
+{
+ char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
+ cpuset_t mask;
+ int i;
+
+ CPU_ZERO(&mask);
+ for (i = 0; i < count; i++, start++)
+ CPU_SET(start, &mask);
+ child->cg_parent = parent;
+ child->cg_child = NULL;
+ child->cg_children = 0;
+ child->cg_level = share;
+ child->cg_count = count;
+ child->cg_flags = flags;
+ child->cg_mask = mask;
+ parent->cg_children++;
+ for (; parent != NULL; parent = parent->cg_parent) {
+ if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
+ panic("Duplicate children in %p. mask (%s) child (%s)",
+ parent,
+ cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
+ cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
+ CPU_OR(&parent->cg_mask, &child->cg_mask);
+ parent->cg_count += child->cg_count;
+ }
+
+ return (start);
+}
+
+struct cpu_group *
+smp_topo_1level(int share, int count, int flags)
+{
+ struct cpu_group *child;
+ struct cpu_group *top;
+ int packages;
+ int cpu;
+ int i;
+
+ cpu = 0;
+ top = &group[0];
+ packages = mp_ncpus / count;
+ top->cg_child = child = &group[1];
+ top->cg_level = CG_SHARE_NONE;
+ for (i = 0; i < packages; i++, child++)
+ cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
+ return (top);
+}
+
+struct cpu_group *
+smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
+ int l1flags)
+{
+ struct cpu_group *top;
+ struct cpu_group *l1g;
+ struct cpu_group *l2g;
+ int cpu;
+ int i;
+ int j;
+
+ cpu = 0;
+ top = &group[0];
+ l2g = &group[1];
+ top->cg_child = l2g;
+ top->cg_level = CG_SHARE_NONE;
+ top->cg_children = mp_ncpus / (l2count * l1count);
+ l1g = l2g + top->cg_children;
+ for (i = 0; i < top->cg_children; i++, l2g++) {
+ l2g->cg_parent = top;
+ l2g->cg_child = l1g;
+ l2g->cg_level = l2share;
+ for (j = 0; j < l2count; j++, l1g++)
+ cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
+ l1flags, cpu);
+ }
+ return (top);
+}
+
+
+struct cpu_group *
+smp_topo_find(struct cpu_group *top, int cpu)
+{
+ struct cpu_group *cg;
+ cpuset_t mask;
+ int children;
+ int i;
+
+ CPU_SETOF(cpu, &mask);
+ cg = top;
+ for (;;) {
+ if (!CPU_OVERLAP(&cg->cg_mask, &mask))
+ return (NULL);
+ if (cg->cg_children == 0)
+ return (cg);
+ children = cg->cg_children;
+ for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
+ if (CPU_OVERLAP(&cg->cg_mask, &mask))
+ break;
+ }
+ return (NULL);
+}
+#else /* !SMP */
+
+void
+smp_rendezvous_cpus(cpuset_t map,
+ void (*setup_func)(void *),
+ void (*action_func)(void *),
+ void (*teardown_func)(void *),
+ void *arg)
+{
+ /*
+ * In the !SMP case we just need to ensure the same initial conditions
+ * as the SMP case.
+ */
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+}
+
+void
+smp_rendezvous(void (*setup_func)(void *),
+ void (*action_func)(void *),
+ void (*teardown_func)(void *),
+ void *arg)
+{
+
+ /* See the comments in the smp_rendezvous_cpus() case. */
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+}
+
+/*
+ * Provide dummy SMP support for UP kernels. Modules that need to use SMP
+ * APIs will still work using this dummy support.
+ */
+static void
+mp_setvariables_for_up(void *dummy)
+{
+ mp_ncpus = 1;
+ mp_maxid = PCPU_GET(cpuid);
+ CPU_SETOF(mp_maxid, &all_cpus);
+ KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
+}
+SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
+ mp_setvariables_for_up, NULL);
+#endif /* SMP */
+
+void
+smp_no_rendevous_barrier(void *dummy)
+{
+#ifdef SMP
+ KASSERT((!smp_started),("smp_no_rendevous called and smp is started"));
+#endif
+}
+
+/*
+ * Wait for the specified idle threads to switch once. This ensures that
+ * even preempted threads have cycled through the switch function once,
+ * exiting their codepaths. This allows us to change global pointers
+ * with no other synchronization.
+ */
+int
+quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
+{
+ struct pcpu *pcpu;
+ u_int gen[MAXCPU];
+ int error;
+ int cpu;
+
+ error = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
+ continue;
+ pcpu = pcpu_find(cpu);
+ gen[cpu] = pcpu->pc_idlethread->td_generation;
+ }
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
+ continue;
+ pcpu = pcpu_find(cpu);
+ thread_lock(curthread);
+ sched_bind(curthread, cpu);
+ thread_unlock(curthread);
+ while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
+ error = tsleep(quiesce_cpus, prio, wmesg, 1);
+ if (error != EWOULDBLOCK)
+ goto out;
+ error = 0;
+ }
+ }
+out:
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+
+ return (error);
+}
+
+int
+quiesce_all_cpus(const char *wmesg, int prio)
+{
+
+ return (quiesce_cpus(all_cpus, wmesg, prio));
+}
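+
+/*
+ * Illustrative sketch (not part of the original file): the quiesce
+ * primitives above are what make a lockless pointer swap safe to follow
+ * with a free, along the lines of (names assumed for the example):
+ *
+ *	old = example_global_hook;
+ *	example_global_hook = new;
+ *	quiesce_all_cpus("hookswap", 0);
+ *	free(old, M_TEMP);
+ *
+ * Once every CPU's idle thread has switched at least once, no preempted
+ * thread can still be running through the old pointer.
+ */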
diff --git a/sys/kern/subr_stack.c b/sys/kern/subr_stack.c
new file mode 100644
index 0000000..6408aec
--- /dev/null
+++ b/sys/kern/subr_stack.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#ifdef KTR
+#include <sys/ktr.h>
+#endif
+#include <sys/linker.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+#include <sys/stack.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+FEATURE(stack, "Support for capturing kernel stack");
+
+static MALLOC_DEFINE(M_STACK, "stack", "Stack Traces");
+
+static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen,
+ long *offset);
+static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset);
+
+struct stack *
+stack_create(void)
+{
+ struct stack *st;
+
+ st = malloc(sizeof *st, M_STACK, M_WAITOK | M_ZERO);
+ return (st);
+}
+
+void
+stack_destroy(struct stack *st)
+{
+
+ free(st, M_STACK);
+}
+
+int
+stack_put(struct stack *st, vm_offset_t pc)
+{
+
+ if (st->depth < STACK_MAX) {
+ st->pcs[st->depth++] = pc;
+ return (0);
+ } else
+ return (-1);
+}
+
+void
+stack_copy(const struct stack *src, struct stack *dst)
+{
+
+ *dst = *src;
+}
+
+void
+stack_zero(struct stack *st)
+{
+
+ bzero(st, sizeof *st);
+}
+
+void
+stack_print(const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset);
+ printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ namebuf, offset);
+ }
+}
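+
+/*
+ * Illustrative sketch (not part of the original file): capturing and
+ * printing the current thread's kernel stack usually looks like this,
+ * where stack_save() is the MD capture routine declared in sys/stack.h:
+ *
+ *	struct stack st;
+ *
+ *	stack_zero(&st);
+ *	stack_save(&st);
+ *	stack_print(&st);
+ *
+ * A heap-allocated trace would use stack_create()/stack_destroy() instead
+ * of an on-stack structure.
+ */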
+
+void
+stack_print_short(const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ if (i > 0)
+ printf(" ");
+ if (stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset) == 0)
+ printf("%s+%#lx", namebuf, offset);
+ else
+ printf("%p", (void *)st->pcs[i]);
+ }
+ printf("\n");
+}
+
+void
+stack_print_ddb(const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ stack_symbol_ddb(st->pcs[i], &name, &offset);
+ printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ name, offset);
+ }
+}
+
+#ifdef DDB
+void
+stack_print_short_ddb(const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ if (i > 0)
+ printf(" ");
+ if (stack_symbol_ddb(st->pcs[i], &name, &offset) == 0)
+ printf("%s+%#lx", name, offset);
+ else
+ printf("%p", (void *)st->pcs[i]);
+ }
+ printf("\n");
+}
+#endif
+
+/*
+ * Two print routines -- one for use from DDB and DDB-like contexts, the
+ * other for use in the live kernel.
+ */
+void
+stack_sbuf_print(struct sbuf *sb, const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset);
+ sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ namebuf, offset);
+ }
+}
+
+#ifdef DDB
+void
+stack_sbuf_print_ddb(struct sbuf *sb, const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol_ddb(st->pcs[i], &name, &offset);
+ sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ name, offset);
+ }
+}
+#endif
+
+#ifdef KTR
+void
+stack_ktr(u_int mask, const char *file, int line, const struct stack *st,
+ u_int depth, int cheap)
+{
+#ifdef DDB
+ const char *name;
+ long offset;
+ int i;
+#endif
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ if (cheap) {
+ ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p",
+ st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3],
+ st->pcs[4], st->pcs[5]);
+ if (st->depth <= 6)
+ return;
+ ktr_tracepoint(mask, file, line, "#1 %p %p %p %p %p %p",
+ st->pcs[6], st->pcs[7], st->pcs[8], st->pcs[9],
+ st->pcs[10], st->pcs[11]);
+ if (st->depth <= 12)
+ return;
+ ktr_tracepoint(mask, file, line, "#2 %p %p %p %p %p %p",
+ st->pcs[12], st->pcs[13], st->pcs[14], st->pcs[15],
+ st->pcs[16], st->pcs[17]);
+#ifdef DDB
+ } else {
+ if (depth == 0 || st->depth < depth)
+ depth = st->depth;
+ for (i = 0; i < depth; i++) {
+ (void)stack_symbol_ddb(st->pcs[i], &name, &offset);
+ ktr_tracepoint(mask, file, line, "#%d %p at %s+%#lx",
+ i, st->pcs[i], (u_long)name, offset, 0, 0);
+ }
+#endif
+ }
+}
+#endif
+
+/*
+ * Two variants of stack symbol lookup -- one that uses the DDB interfaces
+ * and bypasses linker locking, and the other that doesn't.
+ */
+static int
+stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset)
+{
+
+ if (linker_search_symbol_name((caddr_t)pc, namebuf, buflen,
+ offset) != 0) {
+ *offset = 0;
+ strlcpy(namebuf, "??", buflen);
+ return (ENOENT);
+ } else
+ return (0);
+}
+
+static int
+stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset)
+{
+ linker_symval_t symval;
+ c_linker_sym_t sym;
+
+ if (linker_ddb_search_symbol((caddr_t)pc, &sym, offset) != 0)
+ goto out;
+ if (linker_ddb_symbol_values(sym, &symval) != 0)
+ goto out;
+ if (symval.name != NULL) {
+ *name = symval.name;
+ return (0);
+ }
+ out:
+ *offset = 0;
+ *name = "??";
+ return (ENOENT);
+}
diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c
new file mode 100644
index 0000000..3d6dc5a
--- /dev/null
+++ b/sys/kern/subr_syscall.c
@@ -0,0 +1,235 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (C) 2010 Konstantin Belousov <kib@freebsd.org>
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ */
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+#include "opt_kdtrace.h"
+
+__FBSDID("$FreeBSD$");
+
+#include <sys/capability.h>
+#include <sys/ktr.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+#include <security/audit/audit.h>
+
+static inline int
+syscallenter(struct thread *td, struct syscall_args *sa)
+{
+ struct proc *p;
+ int error, traced;
+
+ PCPU_INC(cnt.v_syscall);
+ p = td->td_proc;
+
+ td->td_pticks = 0;
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+ if (p->p_flag & P_TRACED) {
+ traced = 1;
+ PROC_LOCK(p);
+ td->td_dbgflags &= ~TDB_USERWR;
+ td->td_dbgflags |= TDB_SCE;
+ PROC_UNLOCK(p);
+ } else
+ traced = 0;
+ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSCALL))
+ ktrsyscall(sa->code, sa->narg, sa->args);
+#endif
+ KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code),
+ (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0],
+ "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]);
+
+ if (error == 0) {
+
+ STOPEVENT(p, S_SCE, sa->narg);
+ if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) {
+ PROC_LOCK(p);
+ ptracestop((td), SIGTRAP);
+ PROC_UNLOCK(p);
+ }
+ if (td->td_dbgflags & TDB_USERWR) {
+ /*
+ * Reread syscall number and arguments if
+ * debugger modified registers or memory.
+ */
+ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSCALL))
+ ktrsyscall(sa->code, sa->narg, sa->args);
+#endif
+ if (error != 0)
+ goto retval;
+ }
+
+#ifdef CAPABILITY_MODE
+ /*
+ * In capability mode, we only allow access to system calls
+ * flagged with SYF_CAPENABLED.
+ */
+ if (IN_CAPABILITY_MODE(td) &&
+ !(sa->callp->sy_flags & SYF_CAPENABLED)) {
+ error = ECAPMODE;
+ goto retval;
+ }
+#endif
+
+ error = syscall_thread_enter(td, sa->callp);
+ if (error != 0)
+ goto retval;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If the systrace module has registered its probe
+ * callback and if there is a probe active for the
+ * syscall 'entry', process the probe.
+ */
+ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0)
+ (*systrace_probe_func)(sa->callp->sy_entry, sa->code,
+ sa->callp, sa->args, 0);
+#endif
+
+ AUDIT_SYSCALL_ENTER(sa->code, td);
+ error = (sa->callp->sy_call)(td, sa->args);
+ AUDIT_SYSCALL_EXIT(error, td);
+
+ /* Save the latest error return value. */
+ if ((td->td_pflags & TDP_NERRNO) == 0)
+ td->td_errno = error;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If the systrace module has registered its probe
+ * callback and if there is a probe active for the
+ * syscall 'return', process the probe.
+ */
+ if (systrace_probe_func != NULL && sa->callp->sy_return != 0)
+ (*systrace_probe_func)(sa->callp->sy_return, sa->code,
+ sa->callp, NULL, (error) ? -1 : td->td_retval[0]);
+#endif
+ syscall_thread_exit(td, sa->callp);
+ }
+ retval:
+ KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code),
+ (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error,
+ "retval0:%#lx", td->td_retval[0], "retval1:%#lx",
+ td->td_retval[1]);
+ if (traced) {
+ PROC_LOCK(p);
+ td->td_dbgflags &= ~TDB_SCE;
+ PROC_UNLOCK(p);
+ }
+ (p->p_sysent->sv_set_syscall_retval)(td, error);
+ return (error);
+}
+
+static inline void
+syscallret(struct thread *td, int error, struct syscall_args *sa __unused)
+{
+ struct proc *p, *p2;
+ int traced;
+
+ p = td->td_proc;
+
+ /*
+ * Handle reschedule and other end-of-syscall issues
+ */
+ userret(td, td->td_frame);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSRET)) {
+ ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ?
+ error : td->td_errno, td->td_retval[0]);
+ }
+#endif
+ td->td_pflags &= ~TDP_NERRNO;
+
+ if (p->p_flag & P_TRACED) {
+ traced = 1;
+ PROC_LOCK(p);
+ td->td_dbgflags |= TDB_SCX;
+ PROC_UNLOCK(p);
+ } else
+ traced = 0;
+ /*
+ * This works because errno is findable through the
+ * register set. If we ever support an emulation where this
+ * is not the case, this code will need to be revisited.
+ */
+ STOPEVENT(p, S_SCX, sa->code);
+ if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) {
+ PROC_LOCK(p);
+ /*
+ * If tracing the execed process, trap to the debugger
+ * so that breakpoints can be set before the program
+ * executes. If debugger requested tracing of syscall
+ * returns, do it now too.
+ */
+ if (traced &&
+ ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 ||
+ (p->p_stops & S_PT_SCX) != 0))
+ ptracestop(td, SIGTRAP);
+ td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK);
+ PROC_UNLOCK(p);
+ }
+
+ if (td->td_pflags & TDP_RFPPWAIT) {
+ /*
+ * Preserve the synchronization semantics of vfork. If
+ * we are waiting for the child to exec or exit, fork set
+ * P_PPWAIT on the child, and here we sleep on the child's
+ * proc until that flag is cleared (on exec or exit).
+ *
+ * Do this after the ptracestop() above has finished, so
+ * that our debugger is not blocked until the child execs
+ * or exits and completes the vfork wait.
+ */
+ td->td_pflags &= ~TDP_RFPPWAIT;
+ p2 = td->td_rfppwait_p;
+ PROC_LOCK(p2);
+ while (p2->p_flag & P_PPWAIT)
+ cv_wait(&p2->p_pwait, &p2->p_mtx);
+ PROC_UNLOCK(p2);
+ }
+}
diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c
new file mode 100644
index 0000000..9c7bf41
--- /dev/null
+++ b/sys/kern/subr_taskqueue.c
@@ -0,0 +1,634 @@
+/*-
+ * Copyright (c) 2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <machine/stdarg.h>
+
+static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues");
+static void *taskqueue_giant_ih;
+static void *taskqueue_ih;
+
+struct taskqueue_busy {
+ struct task *tb_running;
+ TAILQ_ENTRY(taskqueue_busy) tb_link;
+};
+
+struct taskqueue {
+ STAILQ_HEAD(, task) tq_queue;
+ taskqueue_enqueue_fn tq_enqueue;
+ void *tq_context;
+ TAILQ_HEAD(, taskqueue_busy) tq_active;
+ struct mtx tq_mutex;
+ struct thread **tq_threads;
+ int tq_tcount;
+ int tq_spin;
+ int tq_flags;
+ int tq_callouts;
+ taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS];
+ void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];
+};
+
+#define TQ_FLAGS_ACTIVE (1 << 0)
+#define TQ_FLAGS_BLOCKED (1 << 1)
+#define TQ_FLAGS_PENDING (1 << 2)
+
+#define DT_CALLOUT_ARMED (1 << 0)
+
+#define TQ_LOCK(tq) \
+ do { \
+ if ((tq)->tq_spin) \
+ mtx_lock_spin(&(tq)->tq_mutex); \
+ else \
+ mtx_lock(&(tq)->tq_mutex); \
+ } while (0)
+#define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED)
+
+#define TQ_UNLOCK(tq) \
+ do { \
+ if ((tq)->tq_spin) \
+ mtx_unlock_spin(&(tq)->tq_mutex); \
+ else \
+ mtx_unlock(&(tq)->tq_mutex); \
+ } while (0)
+#define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
+
+void
+_timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task,
+ int priority, task_fn_t func, void *context)
+{
+
+ TASK_INIT(&timeout_task->t, priority, func, context);
+ callout_init_mtx(&timeout_task->c, &queue->tq_mutex, 0);
+ timeout_task->q = queue;
+ timeout_task->f = 0;
+}
+
+static __inline int
+TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
+ int t)
+{
+ if (tq->tq_spin)
+ return (msleep_spin(p, m, wm, t));
+ return (msleep(p, m, pri, wm, t));
+}
+
+static struct taskqueue *
+_taskqueue_create(const char *name __unused, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context,
+ int mtxflags, const char *mtxname)
+{
+ struct taskqueue *queue;
+
+ queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO);
+ if (!queue)
+ return NULL;
+
+ STAILQ_INIT(&queue->tq_queue);
+ TAILQ_INIT(&queue->tq_active);
+ queue->tq_enqueue = enqueue;
+ queue->tq_context = context;
+ queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
+ queue->tq_flags |= TQ_FLAGS_ACTIVE;
+ mtx_init(&queue->tq_mutex, mtxname, NULL, mtxflags);
+
+ return queue;
+}
+
+struct taskqueue *
+taskqueue_create(const char *name, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context)
+{
+ return _taskqueue_create(name, mflags, enqueue, context,
+ MTX_DEF, "taskqueue");
+}
+
+void
+taskqueue_set_callback(struct taskqueue *queue,
+ enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback,
+ void *context)
+{
+
+ KASSERT(((cb_type >= TASKQUEUE_CALLBACK_TYPE_MIN) &&
+ (cb_type <= TASKQUEUE_CALLBACK_TYPE_MAX)),
+ ("Callback type %d not valid, must be %d-%d", cb_type,
+ TASKQUEUE_CALLBACK_TYPE_MIN, TASKQUEUE_CALLBACK_TYPE_MAX));
+ KASSERT((queue->tq_callbacks[cb_type] == NULL),
+ ("Re-initialization of taskqueue callback?"));
+
+ queue->tq_callbacks[cb_type] = callback;
+ queue->tq_cb_contexts[cb_type] = context;
+}
+
+/*
+ * Signal a taskqueue thread to terminate.
+ */
+static void
+taskqueue_terminate(struct thread **pp, struct taskqueue *tq)
+{
+
+ while (tq->tq_tcount > 0 || tq->tq_callouts > 0) {
+ wakeup(tq);
+ TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
+ }
+}
+
+void
+taskqueue_free(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
+ taskqueue_terminate(queue->tq_threads, queue);
+ KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
+ KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
+ mtx_destroy(&queue->tq_mutex);
+ free(queue->tq_threads, M_TASKQUEUE);
+ free(queue, M_TASKQUEUE);
+}
+
+static int
+taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task)
+{
+ struct task *ins;
+ struct task *prev;
+
+ /*
+ * Count multiple enqueues.
+ */
+ if (task->ta_pending) {
+ if (task->ta_pending < USHRT_MAX)
+ task->ta_pending++;
+ return (0);
+ }
+
+ /*
+ * Optimise the case when all tasks have the same priority.
+ */
+ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link);
+ if (!prev || prev->ta_priority >= task->ta_priority) {
+ STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link);
+ } else {
+ prev = NULL;
+ for (ins = STAILQ_FIRST(&queue->tq_queue); ins;
+ prev = ins, ins = STAILQ_NEXT(ins, ta_link))
+ if (ins->ta_priority < task->ta_priority)
+ break;
+
+ if (prev)
+ STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link);
+ else
+ STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link);
+ }
+
+ task->ta_pending = 1;
+ if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
+ queue->tq_enqueue(queue->tq_context);
+ else
+ queue->tq_flags |= TQ_FLAGS_PENDING;
+
+ return (0);
+}
+
+int
+taskqueue_enqueue(struct taskqueue *queue, struct task *task)
+{
+ int res;
+
+ TQ_LOCK(queue);
+ res = taskqueue_enqueue_locked(queue, task);
+ TQ_UNLOCK(queue);
+
+ return (res);
+}
+
+static void
+taskqueue_timeout_func(void *arg)
+{
+ struct taskqueue *queue;
+ struct timeout_task *timeout_task;
+
+ timeout_task = arg;
+ queue = timeout_task->q;
+ KASSERT((timeout_task->f & DT_CALLOUT_ARMED) != 0, ("Stray timeout"));
+ timeout_task->f &= ~DT_CALLOUT_ARMED;
+ queue->tq_callouts--;
+ taskqueue_enqueue_locked(timeout_task->q, &timeout_task->t);
+}
+
+int
+taskqueue_enqueue_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task, int ticks)
+{
+ int res;
+
+ TQ_LOCK(queue);
+ KASSERT(timeout_task->q == NULL || timeout_task->q == queue,
+ ("Migrated queue"));
+ KASSERT(!queue->tq_spin, ("Timeout for spin-queue"));
+ timeout_task->q = queue;
+ res = timeout_task->t.ta_pending;
+ if (ticks == 0) {
+ taskqueue_enqueue_locked(queue, &timeout_task->t);
+ } else {
+ if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) {
+ res++;
+ } else {
+ queue->tq_callouts++;
+ timeout_task->f |= DT_CALLOUT_ARMED;
+ if (ticks < 0)
+ ticks = -ticks; /* Ignore overflow. */
+ }
+ if (ticks > 0) {
+ callout_reset(&timeout_task->c, ticks,
+ taskqueue_timeout_func, timeout_task);
+ }
+ }
+ TQ_UNLOCK(queue);
+ return (res);
+}
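+
+/*
+ * Illustrative sketch (not part of the original file): a deferred task is
+ * normally declared with TIMEOUT_TASK_INIT() from sys/taskqueue.h and then
+ * armed in ticks, e.g. (handler and softc names assumed):
+ *
+ *	TIMEOUT_TASK_INIT(taskqueue_thread, &sc->sc_ttask, 0,
+ *	    example_timeout_fn, sc);
+ *	taskqueue_enqueue_timeout(taskqueue_thread, &sc->sc_ttask, hz);
+ *
+ * A ticks value of zero enqueues the task immediately, as handled above.
+ */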
+
+void
+taskqueue_block(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags |= TQ_FLAGS_BLOCKED;
+ TQ_UNLOCK(queue);
+}
+
+void
+taskqueue_unblock(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
+ if (queue->tq_flags & TQ_FLAGS_PENDING) {
+ queue->tq_flags &= ~TQ_FLAGS_PENDING;
+ queue->tq_enqueue(queue->tq_context);
+ }
+ TQ_UNLOCK(queue);
+}
+
+static void
+taskqueue_run_locked(struct taskqueue *queue)
+{
+ struct taskqueue_busy tb;
+ struct task *task;
+ int pending;
+
+ TQ_ASSERT_LOCKED(queue);
+ tb.tb_running = NULL;
+ TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
+
+ while (STAILQ_FIRST(&queue->tq_queue)) {
+ /*
+ * Carefully remove the first task from the queue and
+ * zero its pending count.
+ */
+ task = STAILQ_FIRST(&queue->tq_queue);
+ STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
+ pending = task->ta_pending;
+ task->ta_pending = 0;
+ tb.tb_running = task;
+ TQ_UNLOCK(queue);
+
+ task->ta_func(task->ta_context, pending);
+
+ TQ_LOCK(queue);
+ tb.tb_running = NULL;
+ wakeup(task);
+ }
+ TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
+}
+
+void
+taskqueue_run(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ taskqueue_run_locked(queue);
+ TQ_UNLOCK(queue);
+}
+
+static int
+task_is_running(struct taskqueue *queue, struct task *task)
+{
+ struct taskqueue_busy *tb;
+
+ TQ_ASSERT_LOCKED(queue);
+ TAILQ_FOREACH(tb, &queue->tq_active, tb_link) {
+ if (tb->tb_running == task)
+ return (1);
+ }
+ return (0);
+}
+
+static int
+taskqueue_cancel_locked(struct taskqueue *queue, struct task *task,
+ u_int *pendp)
+{
+
+ if (task->ta_pending > 0)
+ STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link);
+ if (pendp != NULL)
+ *pendp = task->ta_pending;
+ task->ta_pending = 0;
+ return (task_is_running(queue, task) ? EBUSY : 0);
+}
+
+int
+taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp)
+{
+ u_int pending;
+ int error;
+
+ TQ_LOCK(queue);
+ pending = task->ta_pending;
+ error = taskqueue_cancel_locked(queue, task, pendp);
+ TQ_UNLOCK(queue);
+
+ return (error);
+}
+
+int
+taskqueue_cancel_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task, u_int *pendp)
+{
+ u_int pending, pending1;
+ int error;
+
+ TQ_LOCK(queue);
+ pending = !!callout_stop(&timeout_task->c);
+ error = taskqueue_cancel_locked(queue, &timeout_task->t, &pending1);
+ if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) {
+ timeout_task->f &= ~DT_CALLOUT_ARMED;
+ queue->tq_callouts--;
+ }
+ TQ_UNLOCK(queue);
+
+ if (pendp != NULL)
+ *pendp = pending + pending1;
+ return (error);
+}
+
+void
+taskqueue_drain(struct taskqueue *queue, struct task *task)
+{
+
+ if (!queue->tq_spin)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+ TQ_LOCK(queue);
+ while (task->ta_pending != 0 || task_is_running(queue, task))
+ TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0);
+ TQ_UNLOCK(queue);
+}
+
+void
+taskqueue_drain_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task)
+{
+
+ callout_drain(&timeout_task->c);
+ taskqueue_drain(queue, &timeout_task->t);
+}
+
+static void
+taskqueue_swi_enqueue(void *context)
+{
+ swi_sched(taskqueue_ih, 0);
+}
+
+static void
+taskqueue_swi_run(void *dummy)
+{
+ taskqueue_run(taskqueue_swi);
+}
+
+static void
+taskqueue_swi_giant_enqueue(void *context)
+{
+ swi_sched(taskqueue_giant_ih, 0);
+}
+
+static void
+taskqueue_swi_giant_run(void *dummy)
+{
+ taskqueue_run(taskqueue_swi_giant);
+}
+
+int
+taskqueue_start_threads(struct taskqueue **tqp, int count, int pri,
+ const char *name, ...)
+{
+ va_list ap;
+ struct thread *td;
+ struct taskqueue *tq;
+ int i, error;
+ char ktname[MAXCOMLEN + 1];
+
+ if (count <= 0)
+ return (EINVAL);
+
+ tq = *tqp;
+
+ va_start(ap, name);
+ vsnprintf(ktname, sizeof(ktname), name, ap);
+ va_end(ap);
+
+ tq->tq_threads = malloc(sizeof(struct thread *) * count, M_TASKQUEUE,
+ M_NOWAIT | M_ZERO);
+ if (tq->tq_threads == NULL) {
+ printf("%s: no memory for %s threads\n", __func__, ktname);
+ return (ENOMEM);
+ }
+
+ for (i = 0; i < count; i++) {
+ if (count == 1)
+ error = kthread_add(taskqueue_thread_loop, tqp, NULL,
+ &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
+ else
+ error = kthread_add(taskqueue_thread_loop, tqp, NULL,
+ &tq->tq_threads[i], RFSTOPPED, 0,
+ "%s_%d", ktname, i);
+ if (error) {
+ /* Should be OK to continue; taskqueue_free() will do the right thing. */
+ printf("%s: kthread_add(%s): error %d\n", __func__,
+ ktname, error);
+ tq->tq_threads[i] = NULL; /* paranoid */
+ } else
+ tq->tq_tcount++;
+ }
+ for (i = 0; i < count; i++) {
+ if (tq->tq_threads[i] == NULL)
+ continue;
+ td = tq->tq_threads[i];
+ thread_lock(td);
+ sched_prio(td, pri);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+ }
+
+ return (0);
+}
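+
+/*
+ * Sketch of the usual way a private threaded queue is created with the
+ * function above (hedged example; the "foo" names are hypothetical):
+ *
+ *	struct taskqueue *tq;
+ *
+ *	tq = taskqueue_create("foo_tq", M_WAITOK, taskqueue_thread_enqueue,
+ *	    &tq);
+ *	taskqueue_start_threads(&tq, 1, PWAIT, "foo taskq");
+ *
+ * Passing &tq as the enqueue context is what lets
+ * taskqueue_thread_enqueue() find the queue to wake up.
+ */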
+
+static inline void
+taskqueue_run_callback(struct taskqueue *tq,
+ enum taskqueue_callback_type cb_type)
+{
+ taskqueue_callback_fn tq_callback;
+
+ TQ_ASSERT_UNLOCKED(tq);
+ tq_callback = tq->tq_callbacks[cb_type];
+ if (tq_callback != NULL)
+ tq_callback(tq->tq_cb_contexts[cb_type]);
+}
+
+void
+taskqueue_thread_loop(void *arg)
+{
+ struct taskqueue **tqp, *tq;
+
+ tqp = arg;
+ tq = *tqp;
+ taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
+ TQ_LOCK(tq);
+ while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
+ taskqueue_run_locked(tq);
+ /*
+ * Because taskqueue_run_locked() can drop tq_mutex, we
+ * need to check whether the TQ_FLAGS_ACTIVE flag was
+ * cleared in the meantime; if it was, we missed a wakeup
+ * and must not go back to sleep.
+ */
+ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
+ break;
+ TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
+ }
+ taskqueue_run_locked(tq);
+
+ /*
+ * This thread is on its way out, so just drop the lock temporarily
+ * in order to call the shutdown callback. This allows the callback
+ * to look at the taskqueue, even just before it dies.
+ */
+ TQ_UNLOCK(tq);
+ taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
+ TQ_LOCK(tq);
+
+ /* rendezvous with thread that asked us to terminate */
+ tq->tq_tcount--;
+ wakeup_one(tq->tq_threads);
+ TQ_UNLOCK(tq);
+ kthread_exit();
+}
+
+void
+taskqueue_thread_enqueue(void *context)
+{
+ struct taskqueue **tqp, *tq;
+
+ tqp = context;
+ tq = *tqp;
+
+ TQ_ASSERT_LOCKED(tq);
+ wakeup_one(tq);
+}
+
+TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL,
+ swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ,
+ INTR_MPSAFE, &taskqueue_ih));
+
+TASKQUEUE_DEFINE(swi_giant, taskqueue_swi_giant_enqueue, NULL,
+ swi_add(NULL, "Giant taskq", taskqueue_swi_giant_run,
+ NULL, SWI_TQ_GIANT, 0, &taskqueue_giant_ih));
+
+TASKQUEUE_DEFINE_THREAD(thread);
+
+struct taskqueue *
+taskqueue_create_fast(const char *name, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context)
+{
+ return _taskqueue_create(name, mflags, enqueue, context,
+ MTX_SPIN, "fast_taskqueue");
+}
+
+/* NB: for backwards compatibility */
+int
+taskqueue_enqueue_fast(struct taskqueue *queue, struct task *task)
+{
+ return taskqueue_enqueue(queue, task);
+}
+
+static void *taskqueue_fast_ih;
+
+static void
+taskqueue_fast_enqueue(void *context)
+{
+ swi_sched(taskqueue_fast_ih, 0);
+}
+
+static void
+taskqueue_fast_run(void *dummy)
+{
+ taskqueue_run(taskqueue_fast);
+}
+
+TASKQUEUE_FAST_DEFINE(fast, taskqueue_fast_enqueue, NULL,
+ swi_add(NULL, "fast taskq", taskqueue_fast_run, NULL,
+ SWI_TQ_FAST, INTR_MPSAFE, &taskqueue_fast_ih));
+
+int
+taskqueue_member(struct taskqueue *queue, struct thread *td)
+{
+ int i, j, ret = 0;
+
+ for (i = 0, j = 0; ; i++) {
+ if (queue->tq_threads[i] == NULL)
+ continue;
+ if (queue->tq_threads[i] == td) {
+ ret = 1;
+ break;
+ }
+ if (++j >= queue->tq_tcount)
+ break;
+ }
+ return (ret);
+}
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
new file mode 100644
index 0000000..19729a4
--- /dev/null
+++ b/sys/kern/subr_trap.c
@@ -0,0 +1,303 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007 The FreeBSD Foundation
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_ktrace.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pmckern.h>
+#include <sys/proc.h>
+#include <sys/ktr.h>
+#include <sys/pioctl.h>
+#include <sys/ptrace.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+#include <security/audit/audit.h>
+
+#include <machine/cpu.h>
+
+#ifdef VIMAGE
+#include <net/vnet.h>
+#endif
+
+#ifdef XEN
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * Common code that must run before returning to user mode, for both traps
+ * and system calls.
+ */
+void
+userret(struct thread *td, struct trapframe *frame)
+{
+ struct proc *p = td->td_proc;
+
+ CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
+ td->td_name);
+ KASSERT((p->p_flag & P_WEXIT) == 0,
+ ("Exiting process returns to usermode"));
+#if 0
+#ifdef DIAGNOSTIC
+ /* Check that we called signotify() enough. */
+ PROC_LOCK(p);
+ thread_lock(td);
+ if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
+ (td->td_flags & TDF_ASTPENDING) == 0))
+ printf("failed to set signal flags properly for ast()\n");
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+#endif
+#endif
+#ifdef KTRACE
+ KTRUSERRET(td);
+#endif
+ /*
+ * If this thread tickled GEOM, we need to wait for the giggling to
+ * stop before we return to userland
+ */
+ if (td->td_pflags & TDP_GEOM)
+ g_waitidle();
+
+ /*
+ * Charge system time if profiling.
+ */
+ if (p->p_flag & P_PROFIL)
+ addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
+ /*
+ * Let the scheduler adjust our priority etc.
+ */
+ sched_userret(td);
+#ifdef XEN
+ PT_UPDATES_FLUSH();
+#endif
+
+ /*
+ * Check for misbehavior.
+ *
+ * If a callchain trace is in progress on behalf of hwpmc(4),
+ * skip the scheduler pinning check: the hwpmc(4) subsystem
+ * collects the callchain at the ast() checkpoint, which is
+ * past userret().
+ */
+ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
+ KASSERT(td->td_critnest == 0,
+ ("userret: Returning in a critical section"));
+ KASSERT(td->td_locks == 0,
+ ("userret: Returning with %d locks held", td->td_locks));
+ KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
+ ("userret: Returning with pagefaults disabled"));
+ KASSERT(td->td_no_sleeping == 0,
+ ("userret: Returning with sleep disabled"));
+ KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0,
+ ("userret: Returning with with pinned thread"));
+ KASSERT(td->td_vp_reserv == 0,
+ ("userret: Returning while holding vnode reservation"));
+ KASSERT((td->td_flags & TDF_SBDRY) == 0,
+ ("userret: Returning with stop signals deferred"));
+#ifdef VIMAGE
+ /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
+ VNET_ASSERT(curvnet == NULL,
+ ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
+ __func__, td, p->p_pid, td->td_name, curvnet,
+ (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
+#endif
+#ifdef RACCT
+ PROC_LOCK(p);
+ while (p->p_throttled == 1)
+ msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ PROC_UNLOCK(p);
+#endif
+}
+
+/*
+ * Process an asynchronous software trap.
+ * This is relatively easy.
+ * This function will return with preemption disabled.
+ */
+void
+ast(struct trapframe *framep)
+{
+ struct thread *td;
+ struct proc *p;
+ int flags;
+ int sig;
+
+ td = curthread;
+ p = td->td_proc;
+
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
+ p->p_comm);
+ KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
+ WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
+ mtx_assert(&Giant, MA_NOTOWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
+ td->td_frame = framep;
+ td->td_pticks = 0;
+
+ /*
+ * This reads td_flags for the checks below and clears the
+ * astpending flag in one "atomic" operation. If another AST
+ * is triggered while we are handling the ASTs saved in flags,
+ * the astpending flag will be set again and ast() will be
+ * called again.
+ */
+ thread_lock(td);
+ flags = td->td_flags;
+ td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
+ TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND);
+ thread_unlock(td);
+ PCPU_INC(cnt.v_trap);
+
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+ if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) {
+ addupc_task(td, td->td_profil_addr, td->td_profil_ticks);
+ td->td_profil_ticks = 0;
+ td->td_pflags &= ~TDP_OWEUPC;
+ }
+#ifdef HWPMC_HOOKS
+ /* Handle Software PMC callchain capture. */
+ if (PMC_IS_PENDING_CALLCHAIN(td))
+ PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep);
+#endif
+ if (flags & TDF_ALRMPEND) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGVTALRM);
+ PROC_UNLOCK(p);
+ }
+ if (flags & TDF_PROFPEND) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGPROF);
+ PROC_UNLOCK(p);
+ }
+#ifdef MAC
+ if (flags & TDF_MACPEND)
+ mac_thread_userret(td);
+#endif
+ if (flags & TDF_NEEDRESCHED) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 1, __func__);
+#endif
+ thread_lock(td);
+ sched_prio(td, td->td_user_pri);
+ mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL);
+ thread_unlock(td);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 1, __func__);
+#endif
+ }
+
+ /*
+ * Check for signals. Unlocked reads of p_pendingcnt or
+ * p_siglist might cause a process-directed signal to be handled
+ * later.
+ */
+ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
+ !SIGISEMPTY(p->p_siglist)) {
+ PROC_LOCK(p);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ while ((sig = cursig(td)) != 0)
+ postsig(sig);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p);
+ }
+ /*
+ * We need to check to see if we have to exit or wait due to a
+ * single threading requirement or some other STOP condition.
+ */
+ if (flags & TDF_NEEDSUSPCHK) {
+ PROC_LOCK(p);
+ thread_suspend_check(0);
+ PROC_UNLOCK(p);
+ }
+
+ if (td->td_pflags & TDP_OLDMASK) {
+ td->td_pflags &= ~TDP_OLDMASK;
+ kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0);
+ }
+
+ userret(td, framep);
+}
+
+const char *
+syscallname(struct proc *p, u_int code)
+{
+ static const char unknown[] = "unknown";
+ struct sysentvec *sv;
+
+ sv = p->p_sysent;
+ if (sv->sv_syscallnames == NULL || code >= sv->sv_size)
+ return (unknown);
+ return (sv->sv_syscallnames[code]);
+}
diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c
new file mode 100644
index 0000000..0a21ad9
--- /dev/null
+++ b/sys/kern/subr_turnstile.c
@@ -0,0 +1,1308 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Implementation of turnstiles used to hold queue of threads blocked on
+ * non-sleepable locks. Sleepable locks use condition variables to
+ * implement their queues. Turnstiles differ from sleep queues in that
+ * turnstile queues are assigned to a lock held by an owning thread. Thus,
+ * when one thread is enqueued onto a turnstile, it can lend its priority
+ * to the owning thread.
+ *
+ * We wish to avoid bloating locks with an embedded turnstile and we do not
+ * want to use back-pointers in the locks for the same reason. Thus, we
+ * use a similar approach to that of Solaris 7 as described in Solaris
+ * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up
+ * in a hash table based on the address of the lock. Each entry in the
+ * hash table is a linked list of turnstiles and is called a turnstile
+ * chain. Each chain contains a spin mutex that protects all of the
+ * turnstiles in the chain.
+ *
+ * Each time a thread is created, a turnstile is allocated from a UMA zone
+ * and attached to that thread. When a thread blocks on a lock, if it is the
+ * first thread to block, it lends its turnstile to the lock. If the lock
+ * already has a turnstile, then it gives its turnstile to the free list of
+ * the lock's turnstile. When a thread is woken up, it takes a turnstile from
+ * the free list if there are any other waiters. If it is the only thread
+ * blocked on the lock, then it reclaims the turnstile associated with the lock
+ * and removes it from the hash table.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_turnstile_profiling.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/sysctl.h>
+#include <sys/turnstile.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <sys/lockmgr.h>
+#include <sys/sx.h>
+#endif
+
+/*
+ * Constants for the hash table of turnstile chains. TC_SHIFT is a magic
+ * number chosen because the sleep queues use the same value for the
+ * shift. Basically, we ignore the lower 8 bits of the address.
+ * TC_TABLESIZE must be a power of two for TC_MASK to work properly.
+ */
+#define TC_TABLESIZE 128 /* Must be power of 2. */
+#define TC_MASK (TC_TABLESIZE - 1)
+#define TC_SHIFT 8
+#define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK)
+#define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)]
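+
+/*
+ * For example (purely illustrative address), a lock object at
+ * 0xc0ffee00 hashes to chain ((0xc0ffee00 >> 8) & 127) == 0x6e, so
+ * TC_LOOKUP() on it yields &turnstile_chains[0x6e].
+ */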
+
+/*
+ * There are three different lists of turnstiles as follows. The list
+ * connected by ts_link entries is a per-thread list of all the turnstiles
+ * attached to locks that we own. This is used to fix up our priority when
+ * a lock is released. The other two lists use the ts_hash entries. The
+ * first of these two is the turnstile chain list that a turnstile is on
+ * when it is attached to a lock. The second list to use ts_hash is the
+ * free list hung off of a turnstile that is attached to a lock.
+ *
+ * Each turnstile contains three lists of threads. The two ts_blocked lists
+ * are linked lists of threads blocked on the turnstile's lock. One list is
+ * for exclusive waiters, and the other is for shared waiters. The
+ * ts_pending list is a linked list of threads previously awakened by
+ * turnstile_signal() or turnstile_wait() that are waiting to be put on
+ * the run queue.
+ *
+ * Locking key:
+ * c - turnstile chain lock
+ * q - td_contested lock
+ */
+struct turnstile {
+ struct mtx ts_lock; /* Spin lock for self. */
+ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */
+ struct threadqueue ts_pending; /* (c) Pending threads. */
+ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */
+ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */
+ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */
+ struct lock_object *ts_lockobj; /* (c) Lock we reference. */
+ struct thread *ts_owner; /* (c + q) Who owns the lock. */
+};
+
+struct turnstile_chain {
+ LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */
+ struct mtx tc_lock; /* Spin lock for this chain. */
+#ifdef TURNSTILE_PROFILING
+ u_int tc_depth; /* Length of tc_queues. */
+ u_int tc_max_depth; /* Max length of tc_queues. */
+#endif
+};
+
+#ifdef TURNSTILE_PROFILING
+u_int turnstile_max_depth;
+static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0,
+ "turnstile profiling");
+static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0,
+ "turnstile chain stats");
+SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD,
+ &turnstile_max_depth, 0, "maximum depth achieved of a single chain");
+#endif
+static struct mtx td_contested_lock;
+static struct turnstile_chain turnstile_chains[TC_TABLESIZE];
+static uma_zone_t turnstile_zone;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static void init_turnstile0(void *dummy);
+#ifdef TURNSTILE_PROFILING
+static void init_turnstile_profiling(void *arg);
+#endif
+static void propagate_priority(struct thread *td);
+static int turnstile_adjust_thread(struct turnstile *ts,
+ struct thread *td);
+static struct thread *turnstile_first_waiter(struct turnstile *ts);
+static void turnstile_setowner(struct turnstile *ts, struct thread *owner);
+#ifdef INVARIANTS
+static void turnstile_dtor(void *mem, int size, void *arg);
+#endif
+static int turnstile_init(void *mem, int size, int flags);
+static void turnstile_fini(void *mem, int size);
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , sleep, sleep);
+SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "struct thread *",
+ "struct proc *");
+
+/*
+ * Walks the chain of turnstiles and their owners to propagate the priority
+ * of the thread being blocked to all the threads holding locks that have to
+ * release their locks before this thread can run again.
+ */
+static void
+propagate_priority(struct thread *td)
+{
+ struct turnstile *ts;
+ int pri;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ pri = td->td_priority;
+ ts = td->td_blocked;
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ /*
+ * Grab a recursive lock on this turnstile chain so it stays locked
+ * for the whole operation. The caller expects us to return with
+ * the original lock held. We only ever lock down the chain so
+ * the lock order is constant.
+ */
+ mtx_lock_spin(&ts->ts_lock);
+ for (;;) {
+ td = ts->ts_owner;
+
+ if (td == NULL) {
+ /*
+ * This might be a read lock with no owner. There's
+ * not much we can do, so just bail.
+ */
+ mtx_unlock_spin(&ts->ts_lock);
+ return;
+ }
+
+ thread_lock_flags(td, MTX_DUPOK);
+ mtx_unlock_spin(&ts->ts_lock);
+ MPASS(td->td_proc != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+
+ /*
+ * If the thread is asleep, then we are probably about
+ * to deadlock. To make debugging this easier, show
+ * a backtrace of the misbehaving thread and panic rather than
+ * leave the kernel deadlocked.
+ */
+ if (TD_IS_SLEEPING(td)) {
+ printf(
+ "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
+ td->td_tid, td->td_proc->p_pid);
+ kdb_backtrace_thread(td);
+ panic("sleeping thread");
+ }
+
+ /*
+ * If this thread already has higher priority than the
+ * thread that is being blocked, we are finished.
+ */
+ if (td->td_priority <= pri) {
+ thread_unlock(td);
+ return;
+ }
+
+ /*
+ * Bump this thread's priority.
+ */
+ sched_lend_prio(td, pri);
+
+ /*
+ * If lock holder is actually running or on the run queue
+ * then we are done.
+ */
+ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
+ MPASS(td->td_blocked == NULL);
+ thread_unlock(td);
+ return;
+ }
+
+#ifndef SMP
+ /*
+ * For UP, we check to see if td is curthread (this should
+ * never happen, however, as it would mean we are deadlocked).
+ */
+ KASSERT(td != curthread, ("Deadlock detected"));
+#endif
+
+ /*
+ * If we aren't blocked on a lock, we should be.
+ */
+ KASSERT(TD_ON_LOCK(td), (
+ "thread %d(%s):%d holds %s but isn't blocked on a lock\n",
+ td->td_tid, td->td_name, td->td_state,
+ ts->ts_lockobj->lo_name));
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ ts = td->td_blocked;
+ MPASS(ts != NULL);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ /* Resort td on the list if needed. */
+ if (!turnstile_adjust_thread(ts, td)) {
+ mtx_unlock_spin(&ts->ts_lock);
+ return;
+ }
+ /* The thread lock is released as ts lock above. */
+ }
+}
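+
+/*
+ * Worked example (hypothetical threads and priorities; recall that lower
+ * numeric values mean higher priority): if thread A at priority 80 blocks
+ * on a mutex owned by thread B at priority 120, and B is itself blocked on
+ * a mutex owned by thread C at priority 140, propagate_priority() lends
+ * priority 80 first to B and then to C, so that C can run and eventually
+ * release the lock that B (and hence A) is waiting for.
+ */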
+
+/*
+ * Adjust the thread's position on a turnstile after its priority has been
+ * changed.
+ */
+static int
+turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
+{
+ struct thread *td1, *td2;
+ int queue;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ MPASS(TD_ON_LOCK(td));
+
+ /*
+ * This thread may not be blocked on this turnstile anymore
+ * but instead might already be woken up on another CPU
+ * that is waiting on the thread lock in turnstile_unpend() to
+ * finish waking this thread up. We can detect this case
+ * by checking to see if this thread has been given a
+ * turnstile by either turnstile_signal() or
+ * turnstile_broadcast(). In this case, treat the thread as
+ * if it was already running.
+ */
+ if (td->td_turnstile != NULL)
+ return (0);
+
+ /*
+ * Check if the thread needs to be moved on the blocked chain.
+ * It needs to be moved if either its priority is lower than
+ * the previous thread or higher than the next thread.
+ */
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ td1 = TAILQ_PREV(td, threadqueue, td_lockq);
+ td2 = TAILQ_NEXT(td, td_lockq);
+ if ((td1 != NULL && td->td_priority < td1->td_priority) ||
+ (td2 != NULL && td->td_priority > td2->td_priority)) {
+
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved to.
+ */
+ queue = td->td_tsqueue;
+ MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (td1->td_priority > td->td_priority)
+ break;
+ }
+
+ if (td1 == NULL)
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ else
+ TAILQ_INSERT_BEFORE(td1, td, td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+ if (td1 == NULL)
+ CTR3(KTR_LOCK,
+ "turnstile_adjust_thread: td %d put at tail on [%p] %s",
+ td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name);
+ else
+ CTR4(KTR_LOCK,
+ "turnstile_adjust_thread: td %d moved before %d on [%p] %s",
+ td->td_tid, td1->td_tid, ts->ts_lockobj,
+ ts->ts_lockobj->lo_name);
+ }
+ return (1);
+}
+
+/*
+ * Early initialization of turnstiles. This is not done via a SYSINIT()
+ * since this needs to be initialized very early when mutexes are first
+ * initialized.
+ */
+void
+init_turnstiles(void)
+{
+ int i;
+
+ for (i = 0; i < TC_TABLESIZE; i++) {
+ LIST_INIT(&turnstile_chains[i].tc_turnstiles);
+ mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain",
+ NULL, MTX_SPIN);
+ }
+ mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
+ LIST_INIT(&thread0.td_contested);
+ thread0.td_turnstile = NULL;
+}
+
+#ifdef TURNSTILE_PROFILING
+static void
+init_turnstile_profiling(void *arg)
+{
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+ int i;
+
+ for (i = 0; i < TC_TABLESIZE; i++) {
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "turnstile chain stats");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0,
+ NULL);
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth,
+ 0, NULL);
+ }
+}
+SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
+ init_turnstile_profiling, NULL);
+#endif
+
+static void
+init_turnstile0(void *dummy)
+{
+
+ turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile),
+ NULL,
+#ifdef INVARIANTS
+ turnstile_dtor,
+#else
+ NULL,
+#endif
+ turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
+ thread0.td_turnstile = turnstile_alloc();
+}
+SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);
+
+/*
+ * Update a thread on the turnstile list after its priority has been changed.
+ * The old priority is passed in as an argument.
+ */
+void
+turnstile_adjust(struct thread *td, u_char oldpri)
+{
+ struct turnstile *ts;
+
+ MPASS(TD_ON_LOCK(td));
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ ts = td->td_blocked;
+ MPASS(ts != NULL);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+
+ /* Resort the turnstile on the list. */
+ if (!turnstile_adjust_thread(ts, td))
+ return;
+ /*
+ * If our priority was lowered and we are at the head of the
+ * turnstile, then propagate our new priority up the chain.
+ * Note that we currently don't try to revoke lent priorities
+ * when our priority goes up.
+ */
+ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
+ td->td_tsqueue == TS_SHARED_QUEUE);
+ if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
+ td->td_priority < oldpri) {
+ propagate_priority(td);
+ }
+}
+
+/*
+ * Set the owner of the lock this turnstile is attached to.
+ */
+static void
+turnstile_setowner(struct turnstile *ts, struct thread *owner)
+{
+
+ mtx_assert(&td_contested_lock, MA_OWNED);
+ MPASS(ts->ts_owner == NULL);
+
+ /* A shared lock might not have an owner. */
+ if (owner == NULL)
+ return;
+
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
+ ts->ts_owner = owner;
+ LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
+}
+
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+turnstile_dtor(void *mem, int size, void *arg)
+{
+ struct turnstile *ts;
+
+ ts = mem;
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+turnstile_init(void *mem, int size, int flags)
+{
+ struct turnstile *ts;
+
+ bzero(mem, size);
+ ts = mem;
+ TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ TAILQ_INIT(&ts->ts_pending);
+ LIST_INIT(&ts->ts_free);
+ mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE);
+ return (0);
+}
+
+static void
+turnstile_fini(void *mem, int size)
+{
+ struct turnstile *ts;
+
+ ts = mem;
+ mtx_destroy(&ts->ts_lock);
+}
+
+/*
+ * Get a turnstile for a new thread.
+ */
+struct turnstile *
+turnstile_alloc(void)
+{
+
+ return (uma_zalloc(turnstile_zone, M_WAITOK));
+}
+
+/*
+ * Free a turnstile when a thread is destroyed.
+ */
+void
+turnstile_free(struct turnstile *ts)
+{
+
+ uma_zfree(turnstile_zone, ts);
+}
+
+/*
+ * Lock the turnstile chain associated with the specified lock.
+ */
+void
+turnstile_chain_lock(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+
+ tc = TC_LOOKUP(lock);
+ mtx_lock_spin(&tc->tc_lock);
+}
+
+struct turnstile *
+turnstile_trywait(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ tc = TC_LOOKUP(lock);
+ mtx_lock_spin(&tc->tc_lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
+ return (ts);
+ }
+
+ ts = curthread->td_turnstile;
+ MPASS(ts != NULL);
+ mtx_lock_spin(&ts->ts_lock);
+ KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+ ts->ts_lockobj = lock;
+
+ return (ts);
+}
+
+void
+turnstile_cancel(struct turnstile *ts)
+{
+ struct turnstile_chain *tc;
+ struct lock_object *lock;
+
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+
+ mtx_unlock_spin(&ts->ts_lock);
+ lock = ts->ts_lockobj;
+ if (ts == curthread->td_turnstile)
+ ts->ts_lockobj = NULL;
+ tc = TC_LOOKUP(lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Look up the turnstile for a lock in the hash table, locking the associated
+ * turnstile chain along the way. If no turnstile is found in the hash
+ * table, NULL is returned.
+ */
+struct turnstile *
+turnstile_lookup(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ tc = TC_LOOKUP(lock);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
+ return (ts);
+ }
+ return (NULL);
+}
+
+/*
+ * Unlock the turnstile chain associated with a given lock.
+ */
+void
+turnstile_chain_unlock(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+
+ tc = TC_LOOKUP(lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Return a pointer to the thread waiting on this turnstile with the
+ * highest priority, or NULL if the turnstile has no waiters.
+ */
+static struct thread *
+turnstile_first_waiter(struct turnstile *ts)
+{
+ struct thread *std, *xtd;
+
+ std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority))
+ return (std);
+ return (xtd);
+}
+
+/*
+ * Take ownership of a turnstile and adjust the priority of the new
+ * owner appropriately.
+ */
+void
+turnstile_claim(struct turnstile *ts)
+{
+ struct thread *td, *owner;
+ struct turnstile_chain *tc;
+
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts != curthread->td_turnstile);
+
+ owner = curthread;
+ mtx_lock_spin(&td_contested_lock);
+ turnstile_setowner(ts, owner);
+ mtx_unlock_spin(&td_contested_lock);
+
+ td = turnstile_first_waiter(ts);
+ MPASS(td != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+
+ /*
+ * Update the priority of the new owner if needed.
+ */
+ thread_lock(owner);
+ if (td->td_priority < owner->td_priority)
+ sched_lend_prio(owner, td->td_priority);
+ thread_unlock(owner);
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_unlock_spin(&ts->ts_lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Block the current thread on the turnstile associated with 'lock'. This
+ * function will context switch and not return until this thread has been
+ * woken back up. This function must be called with the appropriate
+ * turnstile chain locked and will return with it unlocked.
+ */
+void
+turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
+{
+ struct turnstile_chain *tc;
+ struct thread *td, *td1;
+ struct lock_object *lock;
+
+ td = curthread;
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ if (owner)
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * If the lock does not already have a turnstile, use this thread's
+ * turnstile. Otherwise insert the current thread into the
+ * turnstile already in use by this lock.
+ */
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ if (ts == td->td_turnstile) {
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth++;
+ if (tc->tc_depth > tc->tc_max_depth) {
+ tc->tc_max_depth = tc->tc_depth;
+ if (tc->tc_max_depth > turnstile_max_depth)
+ turnstile_max_depth = tc->tc_max_depth;
+ }
+#endif
+ LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
+ KASSERT(TAILQ_EMPTY(&ts->ts_pending),
+ ("thread's turnstile has pending threads"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
+ ("thread's turnstile has exclusive waiters"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
+ ("thread's turnstile has shared waiters"));
+ KASSERT(LIST_EMPTY(&ts->ts_free),
+ ("thread's turnstile has a non-empty free list"));
+ MPASS(ts->ts_lockobj != NULL);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ turnstile_setowner(ts, owner);
+ mtx_unlock_spin(&td_contested_lock);
+ } else {
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
+ if (td1->td_priority > td->td_priority)
+ break;
+ mtx_lock_spin(&td_contested_lock);
+ if (td1 != NULL)
+ TAILQ_INSERT_BEFORE(td1, td, td_lockq);
+ else
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ MPASS(owner == ts->ts_owner);
+ mtx_unlock_spin(&td_contested_lock);
+ MPASS(td->td_turnstile != NULL);
+ LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
+ }
+ thread_lock(td);
+ thread_lock_set(td, &ts->ts_lock);
+ td->td_turnstile = NULL;
+
+ /* Save who we are blocked on and switch. */
+ lock = ts->ts_lockobj;
+ td->td_tsqueue = queue;
+ td->td_blocked = ts;
+ td->td_lockname = lock->lo_name;
+ td->td_blktick = ticks;
+ TD_SET_LOCK(td);
+ mtx_unlock_spin(&tc->tc_lock);
+ propagate_priority(td);
+
+ if (LOCK_LOG_TEST(lock, 0))
+ CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
+ td->td_tid, lock, lock->lo_name);
+
+ SDT_PROBE0(sched, , , sleep);
+
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ mi_switch(SW_VOL | SWT_TURNSTILE, NULL);
+
+ if (LOCK_LOG_TEST(lock, 0))
+ CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
+ __func__, td->td_tid, lock, lock->lo_name);
+ thread_unlock(td);
+}
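+
+/*
+ * Sketch of how a lock implementation typically drives the blocking side
+ * of this interface (hedged pseudo-code; "lk" and the owner lookup are the
+ * caller's, e.g. the mutex code):
+ *
+ *	ts = turnstile_trywait(&lk->lock_object);
+ *	if (the lock was released while we were getting here) {
+ *		turnstile_cancel(ts);
+ *		retry the fast path;
+ *	}
+ *	turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
+ */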
+
+/*
+ * Pick the highest priority thread on this turnstile and put it on the
+ * pending list. This must be called with the turnstile chain locked.
+ */
+int
+turnstile_signal(struct turnstile *ts, int queue)
+{
+ struct turnstile_chain *tc;
+ struct thread *td;
+ int empty;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(curthread->td_proc->p_magic == P_MAGIC);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * Pick the highest priority thread blocked on this lock and
+ * move it to the pending list.
+ */
+ td = TAILQ_FIRST(&ts->ts_blocked[queue]);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+ TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);
+
+ /*
+ * If the turnstile is now empty, remove it from its chain and
+ * give it to the about-to-be-woken thread. Otherwise take a
+ * turnstile from the free list and give it to the thread.
+ */
+ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ if (empty) {
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(LIST_EMPTY(&ts->ts_free));
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth--;
+#endif
+ } else
+ ts = LIST_FIRST(&ts->ts_free);
+ MPASS(ts != NULL);
+ LIST_REMOVE(ts, ts_hash);
+ td->td_turnstile = ts;
+
+ return (empty);
+}
+
+/*
+ * Put all blocked threads on the pending list. This must be called with
+ * the turnstile chain locked.
+ */
+void
+turnstile_broadcast(struct turnstile *ts, int queue)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts1;
+ struct thread *td;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(curthread->td_proc->p_magic == P_MAGIC);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ /*
+ * We must have the chain locked so that we can remove the empty
+ * turnstile from the hash queue.
+ */
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * Transfer the blocked list to the pending list.
+ */
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+
+ /*
+ * Give a turnstile to each thread. The last thread gets
+ * this turnstile if the turnstile is empty.
+ */
+ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
+ if (LIST_EMPTY(&ts->ts_free)) {
+ MPASS(TAILQ_NEXT(td, td_lockq) == NULL);
+ ts1 = ts;
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth--;
+#endif
+ } else
+ ts1 = LIST_FIRST(&ts->ts_free);
+ MPASS(ts1 != NULL);
+ LIST_REMOVE(ts1, ts_hash);
+ td->td_turnstile = ts1;
+ }
+}
+
+/*
+ * Wakeup all threads on the pending list and adjust the priority of the
+ * current thread appropriately. This must be called with the turnstile
+ * chain locked.
+ */
+void
+turnstile_unpend(struct turnstile *ts, int owner_type)
+{
+ TAILQ_HEAD( ,thread) pending_threads;
+ struct turnstile *nts;
+ struct thread *td;
+ u_char cp, pri;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ MPASS(!TAILQ_EMPTY(&ts->ts_pending));
+
+ /*
+ * Move the list of pending threads out of the turnstile and
+ * into a local variable.
+ */
+ TAILQ_INIT(&pending_threads);
+ TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
+ ts->ts_lockobj = NULL;
+#endif
+ /*
+ * Adjust the priority of curthread based on other contested
+ * locks it owns. Don't lower the priority below the base
+ * priority however.
+ */
+ td = curthread;
+ pri = PRI_MAX;
+ thread_lock(td);
+ mtx_lock_spin(&td_contested_lock);
+ /*
+ * Remove the turnstile from this thread's list of contested locks
+ * since this thread doesn't own it anymore. New threads will
+ * not be blocking on the turnstile until it is claimed by a new
+ * owner. There might not be a current owner if this is a shared
+ * lock.
+ */
+ if (ts->ts_owner != NULL) {
+ ts->ts_owner = NULL;
+ LIST_REMOVE(ts, ts_link);
+ }
+ LIST_FOREACH(nts, &td->td_contested, ts_link) {
+ cp = turnstile_first_waiter(nts)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+ mtx_unlock_spin(&td_contested_lock);
+ sched_unlend_prio(td, pri);
+ thread_unlock(td);
+ /*
+ * Wake up all the pending threads. If a thread is not blocked
+ * on a lock, then it is currently executing on another CPU in
+ * turnstile_wait() or sitting on a run queue waiting to resume
+ * in turnstile_wait(). Set a flag to force it to try to acquire
+ * the lock again instead of blocking.
+ */
+ while (!TAILQ_EMPTY(&pending_threads)) {
+ td = TAILQ_FIRST(&pending_threads);
+ TAILQ_REMOVE(&pending_threads, td, td_lockq);
+ SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+ thread_lock(td);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ MPASS(TD_ON_LOCK(td));
+ TD_CLR_LOCK(td);
+ MPASS(TD_CAN_RUN(td));
+ td->td_blocked = NULL;
+ td->td_lockname = NULL;
+ td->td_blktick = 0;
+#ifdef INVARIANTS
+ td->td_tsqueue = 0xff;
+#endif
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&ts->ts_lock);
+}
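+
+/*
+ * Sketch of the corresponding release side (hedged pseudo-code mirroring
+ * what the mutex/rwlock code does; details vary per lock class):
+ *
+ *	turnstile_chain_lock(&lk->lock_object);
+ *	ts = turnstile_lookup(&lk->lock_object);
+ *	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+ *	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ *	turnstile_chain_unlock(&lk->lock_object);
+ */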
+
+/*
+ * Give up ownership of a turnstile. This must be called with the
+ * turnstile chain locked.
+ */
+void
+turnstile_disown(struct turnstile *ts)
+{
+ struct thread *td;
+ u_char cp, pri;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread);
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+ MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
+ !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+
+ /*
+ * Remove the turnstile from this thread's list of contested locks
+ * since this thread doesn't own it anymore. New threads will
+ * not be blocking on the turnstile until it is claimed by a new
+ * owner.
+ */
+ mtx_lock_spin(&td_contested_lock);
+ ts->ts_owner = NULL;
+ LIST_REMOVE(ts, ts_link);
+ mtx_unlock_spin(&td_contested_lock);
+
+ /*
+ * Adjust the priority of curthread based on other contested
+ * locks it owns. Don't lower the priority below the base
+ * priority however.
+ */
+ td = curthread;
+ pri = PRI_MAX;
+ thread_lock(td);
+ mtx_unlock_spin(&ts->ts_lock);
+ mtx_lock_spin(&td_contested_lock);
+ LIST_FOREACH(ts, &td->td_contested, ts_link) {
+ cp = turnstile_first_waiter(ts)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+ mtx_unlock_spin(&td_contested_lock);
+ sched_unlend_prio(td, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Return the first thread in a turnstile.
+ */
+struct thread *
+turnstile_head(struct turnstile *ts, int queue)
+{
+#ifdef INVARIANTS
+
+ MPASS(ts != NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+#endif
+ return (TAILQ_FIRST(&ts->ts_blocked[queue]));
+}
+
+/*
+ * Returns true if a sub-queue of a turnstile is empty.
+ */
+int
+turnstile_empty(struct turnstile *ts, int queue)
+{
+#ifdef INVARIANTS
+
+ MPASS(ts != NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+#endif
+ return (TAILQ_EMPTY(&ts->ts_blocked[queue]));
+}
+
+#ifdef DDB
+static void
+print_thread(struct thread *td, const char *prefix)
+{
+
+ db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+}
+
+static void
+print_queue(struct threadqueue *queue, const char *header, const char *prefix)
+{
+ struct thread *td;
+
+ db_printf("%s:\n", header);
+ if (TAILQ_EMPTY(queue)) {
+ db_printf("%sempty\n", prefix);
+ return;
+ }
+ TAILQ_FOREACH(td, queue, td_lockq) {
+ print_thread(td, prefix);
+ }
+}
+
+DB_SHOW_COMMAND(turnstile, db_show_turnstile)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+ struct lock_object *lock;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active turnstile for the lock indicated
+ * by the address.
+ */
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ goto found;
+
+ /*
+ * Second, see if there is an active turnstile at the address
+ * indicated.
+ */
+ for (i = 0; i < TC_TABLESIZE; i++)
+ LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
+ if (ts == (struct turnstile *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
+ return;
+found:
+ lock = ts->ts_lockobj;
+ db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
+ lock->lo_name);
+ if (ts->ts_owner)
+ print_thread(ts->ts_owner, "Lock Owner: ");
+ else
+ db_printf("Lock Owner: none\n");
+ print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
+ print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
+ "\t");
+ print_queue(&ts->ts_pending, "Pending Threads", "\t");
+
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * non-sleepable and non-spin locks.
+ */
+static void
+print_lockchain(struct thread *td, const char *prefix)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile *ts;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a turnstile that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_LOCK(td)) {
+ ts = td->td_blocked;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ db_printf("blocked on lock %p (%s) \"%s\"\n",
+ lock, class->lc_name, lock->lo_name);
+ if (ts->ts_owner == NULL)
+ return;
+ td = ts->ts_owner;
+ break;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(lockchain, db_show_lockchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_lockchain(td, "");
+}
+
+DB_SHOW_ALL_COMMAND(chains, db_show_allchains)
+{
+ struct thread *td;
+ struct proc *p;
+ int i;
+
+ i = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
+ db_printf("chain %d:\n", i++);
+ print_lockchain(td, " ");
+ }
+ if (db_pager_quit)
+ return;
+ }
+ }
+}
+DB_SHOW_ALIAS(allchains, db_show_allchains)
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * sleepable locks.
+ */
+static void
+print_sleepchain(struct thread *td, const char *prefix)
+{
+ struct thread *owner;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a sleep lock that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_SLEEPQ(td)) {
+ if (lockmgr_chain(td, &owner) ||
+ sx_chain(td, &owner)) {
+ if (owner == NULL)
+ return;
+ td = owner;
+ break;
+ }
+ db_printf("sleeping on %p \"%s\"\n",
+ td->td_wchan, td->td_wmesg);
+ return;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(sleepchain, db_show_sleepchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_sleepchain(td, "");
+}
+
+static void print_waiters(struct turnstile *ts, int indent);
+
+static void
+print_waiter(struct thread *td, int indent)
+{
+ struct turnstile *ts;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ print_thread(td, "thread ");
+ LIST_FOREACH(ts, &td->td_contested, ts_link)
+ print_waiters(ts, indent + 1);
+}
+
+static void
+print_waiters(struct turnstile *ts, int indent)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct thread *td;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
+ print_waiter(td, indent + 1);
+}
+
+DB_SHOW_COMMAND(locktree, db_show_locktree)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ break;
+ if (ts == NULL) {
+ class = LOCK_CLASS(lock);
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
+ lock->lo_name);
+ } else
+ print_waiters(ts, 0);
+}
+#endif
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
new file mode 100644
index 0000000..53f87c0
--- /dev/null
+++ b/sys/kern/subr_uio.c
@@ -0,0 +1,611 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_zero.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_map.h>
+#ifdef SOCKET_SEND_COW
+#include <vm/vm_object.h>
+#endif
+
+SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
+ "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
+
+static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
+
+#ifdef SOCKET_SEND_COW
+/* Declared in uipc_socket.c */
+extern int so_zero_copy_receive;
+
+/*
+ * Identify the physical page mapped at the given kernel virtual
+ * address. Insert this physical page into the given address space at
+ * the given virtual address, replacing the physical page, if any,
+ * that already exists there.
+ */
+static int
+vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
+{
+ vm_map_t map = mapa;
+ vm_page_t kern_pg, user_pg;
+ vm_object_t uobject;
+ vm_map_entry_t entry;
+ vm_pindex_t upindex;
+ vm_prot_t prot;
+ boolean_t wired;
+
+ KASSERT((uaddr & PAGE_MASK) == 0,
+ ("vm_pgmoveco: uaddr is not page aligned"));
+
+ /*
+ * Herein the physical page is validated and dirtied. It is
+ * unwired in sf_buf_mext().
+ */
+ kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
+ kern_pg->valid = VM_PAGE_BITS_ALL;
+ KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
+ ("vm_pgmoveco: kern_pg is not correctly wired"));
+
+ if ((vm_map_lookup(&map, uaddr,
+ VM_PROT_WRITE, &entry, &uobject,
+ &upindex, &prot, &wired)) != KERN_SUCCESS) {
+ return(EFAULT);
+ }
+ VM_OBJECT_WLOCK(uobject);
+retry:
+ if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
+ if (vm_page_sleep_if_busy(user_pg, "vm_pgmoveco"))
+ goto retry;
+ vm_page_lock(user_pg);
+ pmap_remove_all(user_pg);
+ vm_page_free(user_pg);
+ vm_page_unlock(user_pg);
+ } else {
+ /*
+ * Even if a physical page does not exist in the
+ * object chain's first object, a physical page from a
+ * backing object may be mapped read only.
+ */
+ if (uobject->backing_object != NULL)
+ pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
+ }
+ if (vm_page_insert(kern_pg, uobject, upindex)) {
+ VM_OBJECT_WUNLOCK(uobject);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(uobject);
+ goto retry;
+ }
+ vm_page_dirty(kern_pg);
+ VM_OBJECT_WUNLOCK(uobject);
+ vm_map_lookup_done(map, entry);
+ return(KERN_SUCCESS);
+}
+#endif /* SOCKET_SEND_COW */
+
+int
+copyin_nofault(const void *udaddr, void *kaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyin(udaddr, kaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+int
+copyout_nofault(const void *kaddr, void *udaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyout(kaddr, udaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+#define PHYS_PAGE_COUNT(len) (howmany(len, PAGE_SIZE) + 1)
+
+int
+physcopyin(void *src, vm_paddr_t dst, size_t len)
+{
+ vm_page_t m[PHYS_PAGE_COUNT(len)];
+ struct iovec iov[1];
+ struct uio uio;
+ int i;
+
+ iov[0].iov_base = src;
+ iov[0].iov_len = len;
+ uio.uio_iov = iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_WRITE;
+ for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE)
+ m[i] = PHYS_TO_VM_PAGE(dst);
+ return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio));
+}
+
+int
+physcopyout(vm_paddr_t src, void *dst, size_t len)
+{
+ vm_page_t m[PHYS_PAGE_COUNT(len)];
+ struct iovec iov[1];
+ struct uio uio;
+ int i;
+
+ iov[0].iov_base = dst;
+ iov[0].iov_len = len;
+ uio.uio_iov = iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE)
+ m[i] = PHYS_TO_VM_PAGE(src);
+ return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio));
+}
+
+#undef PHYS_PAGE_COUNT
+
+int
+uiomove(void *cp, int n, struct uio *uio)
+{
+
+ return (uiomove_faultflag(cp, n, uio, 0));
+}
+
+int
+uiomove_nofault(void *cp, int n, struct uio *uio)
+{
+
+ return (uiomove_faultflag(cp, n, uio, 1));
+}
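+
+/*
+ * Typical use (illustrative only; "sc" is a hypothetical softc exposing a
+ * kernel buffer sc_buf of sc_len bytes) from a character device read
+ * routine:
+ *
+ *	while (uio->uio_resid > 0 && uio->uio_offset < sc->sc_len) {
+ *		n = MIN(uio->uio_resid, sc->sc_len - uio->uio_offset);
+ *		error = uiomove(sc->sc_buf + uio->uio_offset, n, uio);
+ *		if (error != 0)
+ *			break;
+ *	}
+ *
+ * uiomove() itself advances uio_offset and uio_resid as it copies.
+ */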
+
+static int
+uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
+{
+ struct thread *td;
+ struct iovec *iov;
+ size_t cnt;
+ int error, newflags, save;
+
+ td = curthread;
+ error = 0;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomove: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
+ ("uiomove proc"));
+ if (!nofault)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling uiomove()");
+
+ /* XXX does it make sense to set TDP_DEADLKTREAT for UIO_SYSSPACE? */
+ newflags = TDP_DEADLKTREAT;
+ if (uio->uio_segflg == UIO_USERSPACE && nofault) {
+ /*
+ * Fail if a non-spurious page fault occurs.
+ */
+ newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+ }
+ save = curthread_pflags_set(newflags);
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ maybe_yield();
+ if (uio->uio_rw == UIO_READ)
+ error = copyout(cp, iov->iov_base, cnt);
+ else
+ error = copyin(iov->iov_base, cp, cnt);
+ if (error)
+ goto out;
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp = (char *)cp + cnt;
+ n -= cnt;
+ }
+out:
+ curthread_pflags_restore(save);
+ return (error);
+}
+
+/*
+ * Wrapper for uiomove() that validates the arguments against a known-good
+ * kernel buffer. Currently, uiomove accepts a signed (n) argument, which
+ * is almost definitely a bad thing, so we catch that here as well. We
+ * return a runtime failure, but it might be desirable to generate a runtime
+ * assertion failure instead.
+ */
+int
+uiomove_frombuf(void *buf, int buflen, struct uio *uio)
+{
+ size_t offset, n;
+
+ if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
+ (offset = uio->uio_offset) != uio->uio_offset)
+ return (EINVAL);
+ if (buflen <= 0 || offset >= buflen)
+ return (0);
+ if ((n = buflen - offset) > IOSIZE_MAX)
+ return (EINVAL);
+ return (uiomove((char *)buf + offset, n, uio));
+}
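
As a quick illustration of the intended use of uiomove_frombuf(): a character-device
read routine that exposes a fixed kernel buffer can delegate all offset and length
handling to it. This is only a sketch; hello_msg, hello_read() and the cdevsw glue
are hypothetical, not part of this change.

    /* Hypothetical d_read method built on uiomove_frombuf(). */
    static char hello_msg[] = "hello from the kernel\n";

    static int
    hello_read(struct cdev *dev, struct uio *uio, int ioflag)
    {

            /* Offset clamping and length validation happen in the wrapper. */
            return (uiomove_frombuf(hello_msg, sizeof(hello_msg) - 1, uio));
    }
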
+
+#ifdef SOCKET_RECV_PFLIP
+/*
+ * Experimental support for zero-copy I/O
+ */
+static int
+userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
+{
+ struct iovec *iov;
+ int error;
+
+ iov = uio->uio_iov;
+ if (uio->uio_rw == UIO_READ) {
+ if ((so_zero_copy_receive != 0)
+ && ((cnt & PAGE_MASK) == 0)
+ && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
+ && ((uio->uio_offset & PAGE_MASK) == 0)
+ && ((((intptr_t) cp) & PAGE_MASK) == 0)
+ && (disposable != 0)) {
+ /* SOCKET: use page-trading */
+ /*
+ * We only want to call vm_pgmoveco() on
+ * disposable pages, since it gives the
+ * kernel page to the userland process.
+ */
+ error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)cp, (vm_offset_t)iov->iov_base);
+
+ /*
+ * If we get an error back, attempt
+ * to use copyout() instead. The
+ * disposable page should be freed
+ * automatically if we weren't able to move
+ * it into userland.
+ */
+ if (error != 0)
+ error = copyout(cp, iov->iov_base, cnt);
+ } else {
+ error = copyout(cp, iov->iov_base, cnt);
+ }
+ } else {
+ error = copyin(iov->iov_base, cp, cnt);
+ }
+ return (error);
+}
+
+int
+uiomoveco(void *cp, int n, struct uio *uio, int disposable)
+{
+ struct iovec *iov;
+ u_int cnt;
+ int error;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomoveco: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ ("uiomoveco proc"));
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ maybe_yield();
+ error = userspaceco(cp, cnt, uio, disposable);
+ if (error)
+ return (error);
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp = (char *)cp + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+#endif /* SOCKET_RECV_PFLIP */
+
+/*
+ * Give next character to user as result of read.
+ */
+int
+ureadc(int c, struct uio *uio)
+{
+ struct iovec *iov;
+ char *iov_base;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling ureadc()");
+
+again:
+ if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
+ panic("ureadc");
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iovcnt--;
+ uio->uio_iov++;
+ goto again;
+ }
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (subyte(iov->iov_base, c) < 0)
+ return (EFAULT);
+ break;
+
+ case UIO_SYSSPACE:
+ iov_base = iov->iov_base;
+ *iov_base = c;
+ break;
+
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + 1;
+ iov->iov_len--;
+ uio->uio_resid--;
+ uio->uio_offset++;
+ return (0);
+}
+
+int
+copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
+ int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyin(src, dst, len);
+ break;
+ case UIO_SYSSPACE:
+ bcopy(src, dst, len);
+ break;
+ default:
+ panic("copyinfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
+
+int
+copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
+ size_t * __restrict copied, int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyinstr(src, dst, len, copied);
+ break;
+ case UIO_SYSSPACE:
+ error = copystr(src, dst, len, copied);
+ break;
+ default:
+ panic("copyinstrfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
+
+int
+copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
+{
+ u_int iovlen;
+
+ *iov = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (error);
+ iovlen = iovcnt * sizeof (struct iovec);
+ *iov = malloc(iovlen, M_IOV, M_WAITOK);
+ error = copyin(iovp, *iov, iovlen);
+ if (error) {
+ free(*iov, M_IOV);
+ *iov = NULL;
+ }
+ return (error);
+}
+
+int
+copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
+{
+ struct iovec *iov;
+ struct uio *uio;
+ u_int iovlen;
+ int error, i;
+
+ *uiop = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (EINVAL);
+ iovlen = iovcnt * sizeof (struct iovec);
+ uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
+ iov = (struct iovec *)(uio + 1);
+ error = copyin(iovp, iov, iovlen);
+ if (error) {
+ free(uio, M_IOV);
+ return (error);
+ }
+ uio->uio_iov = iov;
+ uio->uio_iovcnt = iovcnt;
+ uio->uio_segflg = UIO_USERSPACE;
+ uio->uio_offset = -1;
+ uio->uio_resid = 0;
+ for (i = 0; i < iovcnt; i++) {
+ if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) {
+ free(uio, M_IOV);
+ return (EINVAL);
+ }
+ uio->uio_resid += iov->iov_len;
+ iov++;
+ }
+ *uiop = uio;
+ return (0);
+}
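
To show how copyinuio() is meant to be consumed, here is a hedged sketch of a
readv-style handler; foo_readv() and struct foo_readv_args are hypothetical and the
actual I/O call is elided. The important detail is that the uio and its iovec array
live in a single M_IOV allocation, so one free() releases both.

    static int
    foo_readv(struct thread *td, struct foo_readv_args *uap)
    {
            struct uio *auio;
            int error;

            error = copyinuio(uap->iovp, uap->iovcnt, &auio);
            if (error != 0)
                    return (error);
            /* copyinuio() already set uio_iov/iovcnt/resid, UIO_USERSPACE, offset -1. */
            auio->uio_rw = UIO_READ;
            auio->uio_td = td;
            /* ... hand auio to the I/O path ... */
            free(auio, M_IOV);      /* frees the uio and its iovec array together */
            return (error);
    }
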
+
+struct uio *
+cloneuio(struct uio *uiop)
+{
+ struct uio *uio;
+ int iovlen;
+
+ iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
+ uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
+ *uio = *uiop;
+ uio->uio_iov = (struct iovec *)(uio + 1);
+ bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
+ return (uio);
+}
+
+/*
+ * Map some anonymous memory in user space of size sz, rounded up to the page
+ * boundary.
+ */
+int
+copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
+{
+ struct vmspace *vms;
+ int error;
+ vm_size_t size;
+
+ vms = td->td_proc->p_vmspace;
+
+ /*
+ * Map somewhere after the heap in process memory.
+ */
+ PROC_LOCK(td->td_proc);
+ *addr = round_page((vm_offset_t)vms->vm_daddr +
+ lim_max(td->td_proc, RLIMIT_DATA));
+ PROC_UNLOCK(td->td_proc);
+
+ /* round size up to page boundary */
+ size = (vm_size_t)round_page(sz);
+
+ error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
+ VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);
+
+ return (error);
+}
+
+/*
+ * Unmap memory in user space.
+ */
+int
+copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
+{
+ vm_map_t map;
+ vm_size_t size;
+
+ if (sz == 0)
+ return (0);
+
+ map = &td->td_proc->p_vmspace->vm_map;
+ size = (vm_size_t)round_page(sz);
+
+ if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
+ return (EINVAL);
+
+ return (0);
+}
diff --git a/sys/kern/subr_unit.c b/sys/kern/subr_unit.c
new file mode 100644
index 0000000..3bf7aaf
--- /dev/null
+++ b/sys/kern/subr_unit.c
@@ -0,0 +1,1015 @@
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ *
+ * Unit number allocation functions.
+ *
+ * These functions implement a mixed run-length/bitmap management of unit
+ * number spaces in a very memory efficient manner.
+ *
+ * Allocation policy is always lowest free number first.
+ *
+ * A return value of -1 signals that no more unit numbers are available.
+ *
+ * There is no cost associated with the range of unit numbers, so unless
+ * the resource really is finite, specify INT_MAX to new_unrhdr() and
+ * forget about checking the return value.
+ *
+ * If a mutex is not provided when the unit number space is created, a
+ * default global mutex is used. The advantage of passing a mutex in is
+ * that the alloc_unrl() function can be called with the mutex already
+ * held (it will not be released by alloc_unrl()).
+ *
+ * The allocation function alloc_unr{l}() never sleeps (but it may block on
+ * the mutex of course).
+ *
+ * Freeing a unit number may require allocating memory, and can therefore
+ * sleep, so the free_unr() function does not come in a pre-locked variant.
+ *
+ * A userland test program is included.
+ *
+ * Memory usage is a very complex function of the exact allocation
+ * pattern, but always very compact:
+ * * For the very typical case where a single unbroken run of unit
+ * numbers is allocated, 44 bytes are used on i386.
+ * * For a unit number space of 1000 units and the random pattern
+ * in the usermode test program included, the worst case usage
+ * was 252 bytes on i386 for 500 allocated and 500 free units.
+ * * For a unit number space of 10000 units and the random pattern
+ * in the usermode test program included, the worst case usage
+ * was 798 bytes on i386 for 5000 allocated and 5000 free units.
+ * * The worst case is where every other unit number is allocated and
+ * the rest are free. In that case 44 + N/4 bytes are used where
+ * N is the number of the highest unit allocated.
+ */
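
A minimal usage sketch of the API described above, assuming a hypothetical consumer
(foo_units, foo_units_demo()); it passes NULL to new_unrhdr() to use the default
global mutex and checks alloc_unr() for -1 as described.

    static struct unrhdr *foo_units;    /* hypothetical consumer */

    static void
    foo_units_demo(void)
    {
            int u;

            foo_units = new_unrhdr(0, INT_MAX, NULL);   /* default global mutex */
            u = alloc_unr(foo_units);   /* lowest free unit, or -1 when exhausted */
            if (u != -1)
                    free_unr(foo_units, u); /* may sleep: it can allocate memory */
            delete_unrhdr(foo_units);       /* legal only once every unit is free */
    }
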
+
+#include <sys/types.h>
+#include <sys/bitstring.h>
+#include <sys/_unrhdr.h>
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+/*
+ * In theory it would be smarter to allocate the individual blocks
+ * with the zone allocator, but at this time the expectation is that
+ * there will typically not even be enough allocations to fill a single
+ * page, so we stick with malloc for now.
+ */
+static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation");
+
+#define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO)
+#define Free(foo) free(foo, M_UNIT)
+
+static struct mtx unitmtx;
+
+MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF);
+
+#else /* ...USERLAND */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define KASSERT(cond, arg) \
+ do { \
+ if (!(cond)) { \
+ printf arg; \
+ abort(); \
+ } \
+ } while (0)
+
+static int no_alloc;
+#define Malloc(foo) _Malloc(foo, __LINE__)
+static void *
+_Malloc(size_t foo, int line)
+{
+
+ KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line));
+ return (calloc(foo, 1));
+}
+#define Free(foo) free(foo)
+
+struct unrhdr;
+
+
+struct mtx {
+ int state;
+} unitmtx;
+
+static void
+mtx_lock(struct mtx *mp)
+{
+ KASSERT(mp->state == 0, ("mutex already locked"));
+ mp->state = 1;
+}
+
+static void
+mtx_unlock(struct mtx *mp)
+{
+ KASSERT(mp->state == 1, ("mutex not locked"));
+ mp->state = 0;
+}
+
+#define MA_OWNED 9
+
+static void
+mtx_assert(struct mtx *mp, int flag)
+{
+ if (flag == MA_OWNED) {
+ KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true"));
+ }
+}
+
+#define CTASSERT(foo)
+#define WITNESS_WARN(flags, lock, fmt, ...) (void)0
+
+#endif /* USERLAND */
+
+/*
+ * This is our basic building block.
+ *
+ * It can be used in three different ways depending on the value of the ptr
+ * element:
+ * If ptr is NULL, it represents a run of free items.
+ * If ptr points to the unrhdr it represents a run of allocated items.
+ * Otherwise it points to a bitstring of allocated items.
+ *
+ * For runs the len field is the length of the run.
+ * For bitmaps the len field represents the number of allocated items.
+ *
+ * The bitmap is the same size as struct unr to optimize memory management.
+ */
+struct unr {
+ TAILQ_ENTRY(unr) list;
+ u_int len;
+ void *ptr;
+};
+
+struct unrb {
+ u_char busy;
+ bitstr_t map[sizeof(struct unr) - 1];
+};
+
+CTASSERT(sizeof(struct unr) == sizeof(struct unrb));
+
+/* Number of bits in the bitmap */
+#define NBITS ((int)sizeof(((struct unrb *)NULL)->map) * 8)
+
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+/*
+ * Consistency check function.
+ *
+ * Checks the internal consistency as well as we can.
+ *
+ * Called at all boundaries of this API.
+ */
+static void
+check_unrhdr(struct unrhdr *uh, int line)
+{
+ struct unr *up;
+ struct unrb *ub;
+ u_int x, y, z, w;
+
+ y = uh->first;
+ z = 0;
+ TAILQ_FOREACH(up, &uh->head, list) {
+ z++;
+ if (up->ptr != uh && up->ptr != NULL) {
+ ub = up->ptr;
+ KASSERT (up->len <= NBITS,
+ ("UNR inconsistency: len %u max %d (line %d)\n",
+ up->len, NBITS, line));
+ z++;
+ w = 0;
+ for (x = 0; x < up->len; x++)
+ if (bit_test(ub->map, x))
+ w++;
+ KASSERT (w == ub->busy,
+ ("UNR inconsistency: busy %u found %u (line %d)\n",
+ ub->busy, w, line));
+ y += w;
+ } else if (up->ptr != NULL)
+ y += up->len;
+ }
+ KASSERT (y == uh->busy,
+ ("UNR inconsistency: items %u found %u (line %d)\n",
+ uh->busy, y, line));
+ KASSERT (z == uh->alloc,
+ ("UNR inconsistency: chunks %u found %u (line %d)\n",
+ uh->alloc, z, line));
+}
+
+#else
+
+static __inline void
+check_unrhdr(struct unrhdr *uh, int line)
+{
+
+}
+
+#endif
+
+
+/*
+ * Userland memory management. Just use calloc and keep track of how
+ * many elements we have allocated for check_unrhdr().
+ */
+
+static __inline void *
+new_unr(struct unrhdr *uh, void **p1, void **p2)
+{
+ void *p;
+
+ uh->alloc++;
+ KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory"));
+ if (*p1 != NULL) {
+ p = *p1;
+ *p1 = NULL;
+ return (p);
+ } else {
+ p = *p2;
+ *p2 = NULL;
+ return (p);
+ }
+}
+
+static __inline void
+delete_unr(struct unrhdr *uh, void *ptr)
+{
+ struct unr *up;
+
+ uh->alloc--;
+ up = ptr;
+ TAILQ_INSERT_TAIL(&uh->ppfree, up, list);
+}
+
+void
+clean_unrhdrl(struct unrhdr *uh)
+{
+ struct unr *up;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+ while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) {
+ TAILQ_REMOVE(&uh->ppfree, up, list);
+ mtx_unlock(uh->mtx);
+ Free(up);
+ mtx_lock(uh->mtx);
+ }
+
+}
+
+void
+clean_unrhdr(struct unrhdr *uh)
+{
+
+ mtx_lock(uh->mtx);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+}
+
+void
+init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex)
+{
+
+ KASSERT(low >= 0 && low <= high,
+ ("UNR: use error: new_unrhdr(%d, %d)", low, high));
+ if (mutex != NULL)
+ uh->mtx = mutex;
+ else
+ uh->mtx = &unitmtx;
+ TAILQ_INIT(&uh->head);
+ TAILQ_INIT(&uh->ppfree);
+ uh->low = low;
+ uh->high = high;
+ uh->first = 0;
+ uh->last = 1 + (high - low);
+ check_unrhdr(uh, __LINE__);
+}
+
+/*
+ * Allocate a new unrheader set.
+ *
+ * Highest and lowest valid values given as parameters.
+ */
+
+struct unrhdr *
+new_unrhdr(int low, int high, struct mtx *mutex)
+{
+ struct unrhdr *uh;
+
+ uh = Malloc(sizeof *uh);
+ init_unrhdr(uh, low, high, mutex);
+ return (uh);
+}
+
+void
+delete_unrhdr(struct unrhdr *uh)
+{
+
+ check_unrhdr(uh, __LINE__);
+ KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy));
+ KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr"));
+ KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL,
+ ("unrhdr has postponed item for free"));
+ Free(uh);
+}
+
+static __inline int
+is_bitmap(struct unrhdr *uh, struct unr *up)
+{
+ return (up->ptr != uh && up->ptr != NULL);
+}
+
+/*
+ * Look for a sequence of items which can be combined into a bitmap; if
+ * multiple are present, take the one which saves the most memory.
+ *
+ * Return (1) if a sequence was found to indicate that another call
+ * might be able to do more. Return (0) if we found no suitable sequence.
+ *
+ * NB: called from alloc_unr(), no new memory allocation allowed.
+ */
+static int
+optimize_unr(struct unrhdr *uh)
+{
+ struct unr *up, *uf, *us;
+ struct unrb *ub, *ubf;
+ u_int a, l, ba;
+
+ /*
+ * Look for the run of items (if any) which, when collapsed into
+ * a bitmap, would save the most memory.
+ */
+ us = NULL;
+ ba = 0;
+ TAILQ_FOREACH(uf, &uh->head, list) {
+ if (uf->len >= NBITS)
+ continue;
+ a = 1;
+ if (is_bitmap(uh, uf))
+ a++;
+ l = uf->len;
+ up = uf;
+ while (1) {
+ up = TAILQ_NEXT(up, list);
+ if (up == NULL)
+ break;
+ if ((up->len + l) > NBITS)
+ break;
+ a++;
+ if (is_bitmap(uh, up))
+ a++;
+ l += up->len;
+ }
+ if (a > ba) {
+ ba = a;
+ us = uf;
+ }
+ }
+ if (ba < 3)
+ return (0);
+
+ /*
+ * If the first element is not a bitmap, make it one.
+ * Trying to do so without allocating more memory complicates things
+ * a bit.
+ */
+ if (!is_bitmap(uh, us)) {
+ uf = TAILQ_NEXT(us, list);
+ TAILQ_REMOVE(&uh->head, us, list);
+ a = us->len;
+ l = us->ptr == uh ? 1 : 0;
+ ub = (void *)us;
+ ub->busy = 0;
+ if (l) {
+ bit_nset(ub->map, 0, a);
+ ub->busy += a;
+ } else {
+ bit_nclear(ub->map, 0, a);
+ }
+ if (!is_bitmap(uh, uf)) {
+ if (uf->ptr == NULL) {
+ bit_nclear(ub->map, a, a + uf->len - 1);
+ } else {
+ bit_nset(ub->map, a, a + uf->len - 1);
+ ub->busy += uf->len;
+ }
+ uf->ptr = ub;
+ uf->len += a;
+ us = uf;
+ } else {
+ ubf = uf->ptr;
+ for (l = 0; l < uf->len; l++, a++) {
+ if (bit_test(ubf->map, l)) {
+ bit_set(ub->map, a);
+ ub->busy++;
+ } else {
+ bit_clear(ub->map, a);
+ }
+ }
+ uf->len = a;
+ delete_unr(uh, uf->ptr);
+ uf->ptr = ub;
+ us = uf;
+ }
+ }
+ ub = us->ptr;
+ while (1) {
+ uf = TAILQ_NEXT(us, list);
+ if (uf == NULL)
+ return (1);
+ if (uf->len + us->len > NBITS)
+ return (1);
+ if (uf->ptr == NULL) {
+ bit_nclear(ub->map, us->len, us->len + uf->len - 1);
+ us->len += uf->len;
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, uf);
+ } else if (uf->ptr == uh) {
+ bit_nset(ub->map, us->len, us->len + uf->len - 1);
+ ub->busy += uf->len;
+ us->len += uf->len;
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, uf);
+ } else {
+ ubf = uf->ptr;
+ for (l = 0; l < uf->len; l++, us->len++) {
+ if (bit_test(ubf->map, l)) {
+ bit_set(ub->map, us->len);
+ ub->busy++;
+ } else {
+ bit_clear(ub->map, us->len);
+ }
+ }
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, ubf);
+ delete_unr(uh, uf);
+ }
+ }
+}
+
+/*
+ * See if a given unr should be collapsed with a neighbor.
+ *
+ * NB: called from alloc_unr(), no new memory allocation allowed.
+ */
+static void
+collapse_unr(struct unrhdr *uh, struct unr *up)
+{
+ struct unr *upp;
+ struct unrb *ub;
+
+ /* If bitmap is all set or clear, change it to runlength */
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+ if (ub->busy == up->len) {
+ delete_unr(uh, up->ptr);
+ up->ptr = uh;
+ } else if (ub->busy == 0) {
+ delete_unr(uh, up->ptr);
+ up->ptr = NULL;
+ }
+ }
+
+ /* If nothing left in runlength, delete it */
+ if (up->len == 0) {
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (upp == NULL)
+ upp = TAILQ_NEXT(up, list);
+ TAILQ_REMOVE(&uh->head, up, list);
+ delete_unr(uh, up);
+ up = upp;
+ }
+
+ /* If we still have a "hot-spot", merge with a neighbor if possible */
+ if (up != NULL) {
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (upp != NULL && up->ptr == upp->ptr) {
+ up->len += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ }
+ upp = TAILQ_NEXT(up, list);
+ if (upp != NULL && up->ptr == upp->ptr) {
+ up->len += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ }
+ }
+
+ /* Merge into ->first if possible */
+ upp = TAILQ_FIRST(&uh->head);
+ if (upp != NULL && upp->ptr == uh) {
+ uh->first += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ if (up == upp)
+ up = NULL;
+ }
+
+ /* Merge into ->last if possible */
+ upp = TAILQ_LAST(&uh->head, unrhd);
+ if (upp != NULL && upp->ptr == NULL) {
+ uh->last += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ if (up == upp)
+ up = NULL;
+ }
+
+ /* Try to make bitmaps */
+ while (optimize_unr(uh))
+ continue;
+}
+
+/*
+ * Allocate a free unr.
+ */
+int
+alloc_unrl(struct unrhdr *uh)
+{
+ struct unr *up;
+ struct unrb *ub;
+ u_int x;
+ int y;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+ check_unrhdr(uh, __LINE__);
+ x = uh->low + uh->first;
+
+ up = TAILQ_FIRST(&uh->head);
+
+ /*
+ * If we have an ideal split, just adjust the first+last
+ */
+ if (up == NULL && uh->last > 0) {
+ uh->first++;
+ uh->last--;
+ uh->busy++;
+ return (x);
+ }
+
+ /*
+ * We can always allocate from the first list element, so if we have
+ * nothing on the list, we must have run out of unit numbers.
+ */
+ if (up == NULL)
+ return (-1);
+
+ KASSERT(up->ptr != uh, ("UNR first element is allocated"));
+
+ if (up->ptr == NULL) { /* free run */
+ uh->first++;
+ up->len--;
+ } else { /* bitmap */
+ ub = up->ptr;
+ KASSERT(ub->busy < up->len, ("UNR bitmap confusion"));
+ bit_ffc(ub->map, up->len, &y);
+ KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap."));
+ bit_set(ub->map, y);
+ ub->busy++;
+ x += y;
+ }
+ uh->busy++;
+ collapse_unr(uh, up);
+ return (x);
+}
+
+int
+alloc_unr(struct unrhdr *uh)
+{
+ int i;
+
+ mtx_lock(uh->mtx);
+ i = alloc_unrl(uh);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+ return (i);
+}
+
+static int
+alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2)
+{
+ struct unr *up, *upn;
+ struct unrb *ub;
+ u_int i, last, tl;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+
+ if (item < uh->low + uh->first || item > uh->high)
+ return (-1);
+
+ up = TAILQ_FIRST(&uh->head);
+ /* Ideal split. */
+ if (up == NULL && item - uh->low == uh->first) {
+ uh->first++;
+ uh->last--;
+ uh->busy++;
+ check_unrhdr(uh, __LINE__);
+ return (item);
+ }
+
+ i = item - uh->low - uh->first;
+
+ if (up == NULL) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = NULL;
+ up->len = i;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = 1;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ uh->last = uh->high - uh->low - i;
+ uh->busy++;
+ check_unrhdr(uh, __LINE__);
+ return (item);
+ } else {
+ /* Find the item which contains the unit we want to allocate. */
+ TAILQ_FOREACH(up, &uh->head, list) {
+ if (up->len > i)
+ break;
+ i -= up->len;
+ }
+ }
+
+ if (up == NULL) {
+ if (i > 0) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = NULL;
+ up->len = i;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ }
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = 1;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ goto done;
+ }
+
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+ if (bit_test(ub->map, i) == 0) {
+ bit_set(ub->map, i);
+ ub->busy++;
+ goto done;
+ } else
+ return (-1);
+ } else if (up->ptr == uh)
+ return (-1);
+
+ KASSERT(up->ptr == NULL,
+ ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up));
+
+ /* Split off the tail end, if any. */
+ tl = up->len - (1 + i);
+ if (tl > 0) {
+ upn = new_unr(uh, p1, p2);
+ upn->ptr = NULL;
+ upn->len = tl;
+ TAILQ_INSERT_AFTER(&uh->head, up, upn, list);
+ }
+
+ /* Split off head end, if any */
+ if (i > 0) {
+ upn = new_unr(uh, p1, p2);
+ upn->len = i;
+ upn->ptr = NULL;
+ TAILQ_INSERT_BEFORE(up, upn, list);
+ }
+ up->len = 1;
+ up->ptr = uh;
+
+done:
+ last = uh->high - uh->low - (item - uh->low);
+ if (uh->last > last)
+ uh->last = last;
+ uh->busy++;
+ collapse_unr(uh, up);
+ check_unrhdr(uh, __LINE__);
+ return (item);
+}
+
+int
+alloc_unr_specific(struct unrhdr *uh, u_int item)
+{
+ void *p1, *p2;
+ int i;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific");
+
+ p1 = Malloc(sizeof(struct unr));
+ p2 = Malloc(sizeof(struct unr));
+
+ mtx_lock(uh->mtx);
+ i = alloc_unr_specificl(uh, item, &p1, &p2);
+ mtx_unlock(uh->mtx);
+
+ if (p1 != NULL)
+ Free(p1);
+ if (p2 != NULL)
+ Free(p2);
+
+ return (i);
+}
+
+/*
+ * Free a unr.
+ *
+ * If we can save unrs by using a bitmap, do so.
+ */
+static void
+free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2)
+{
+ struct unr *up, *upp, *upn;
+ struct unrb *ub;
+ u_int pl;
+
+ KASSERT(item >= uh->low && item <= uh->high,
+ ("UNR: free_unr(%u) out of range [%u...%u]",
+ item, uh->low, uh->high));
+ check_unrhdr(uh, __LINE__);
+ item -= uh->low;
+ upp = TAILQ_FIRST(&uh->head);
+ /*
+ * Freeing in the ideal split case
+ */
+ if (item + 1 == uh->first && upp == NULL) {
+ uh->last++;
+ uh->first--;
+ uh->busy--;
+ check_unrhdr(uh, __LINE__);
+ return;
+ }
+ /*
+ * Freeing in the ->first section. Create a run starting at the
+ * freed item. The code below will subdivide it.
+ */
+ if (item < uh->first) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = uh->first - item;
+ TAILQ_INSERT_HEAD(&uh->head, up, list);
+ uh->first -= up->len;
+ }
+
+ item -= uh->first;
+
+ /* Find the item which contains the unit we want to free */
+ TAILQ_FOREACH(up, &uh->head, list) {
+ if (up->len > item)
+ break;
+ item -= up->len;
+ }
+
+ /* Handle bitmap items */
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+
+ KASSERT(bit_test(ub->map, item) != 0,
+ ("UNR: Freeing free item %d (bitmap)\n", item));
+ bit_clear(ub->map, item);
+ uh->busy--;
+ ub->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item));
+
+ /* Just this one left, reap it */
+ if (up->len == 1) {
+ up->ptr = NULL;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Check if we can shift the item into the previous 'free' run */
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (item == 0 && upp != NULL && upp->ptr == NULL) {
+ upp->len++;
+ up->len--;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Check if we can shift the item to the next 'free' run */
+ upn = TAILQ_NEXT(up, list);
+ if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) {
+ upn->len++;
+ up->len--;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Split off the tail end, if any. */
+ pl = up->len - (1 + item);
+ if (pl > 0) {
+ upp = new_unr(uh, p1, p2);
+ upp->ptr = uh;
+ upp->len = pl;
+ TAILQ_INSERT_AFTER(&uh->head, up, upp, list);
+ }
+
+ /* Split off head end, if any */
+ if (item > 0) {
+ upp = new_unr(uh, p1, p2);
+ upp->len = item;
+ upp->ptr = uh;
+ TAILQ_INSERT_BEFORE(up, upp, list);
+ }
+ up->len = 1;
+ up->ptr = NULL;
+ uh->busy--;
+ collapse_unr(uh, up);
+}
+
+void
+free_unr(struct unrhdr *uh, u_int item)
+{
+ void *p1, *p2;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr");
+ p1 = Malloc(sizeof(struct unr));
+ p2 = Malloc(sizeof(struct unr));
+ mtx_lock(uh->mtx);
+ free_unrl(uh, item, &p1, &p2);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+ if (p1 != NULL)
+ Free(p1);
+ if (p2 != NULL)
+ Free(p2);
+}
+
+#ifndef _KERNEL /* USERLAND test driver */
+
+/*
+ * Simple stochastic test driver for the above functions
+ */
+
+static void
+print_unr(struct unrhdr *uh, struct unr *up)
+{
+ u_int x;
+ struct unrb *ub;
+
+ printf(" %p len = %5u ", up, up->len);
+ if (up->ptr == NULL)
+ printf("free\n");
+ else if (up->ptr == uh)
+ printf("alloc\n");
+ else {
+ ub = up->ptr;
+ printf("bitmap(%d) [", ub->busy);
+ for (x = 0; x < up->len; x++) {
+ if (bit_test(ub->map, x))
+ printf("#");
+ else
+ printf(" ");
+ }
+ printf("]\n");
+ }
+}
+
+static void
+print_unrhdr(struct unrhdr *uh)
+{
+ struct unr *up;
+ u_int x;
+
+ printf(
+ "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n",
+ uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc);
+ x = uh->low + uh->first;
+ TAILQ_FOREACH(up, &uh->head, list) {
+ printf(" from = %5u", x);
+ print_unr(uh, up);
+ if (up->ptr == NULL || up->ptr == uh)
+ x += up->len;
+ else
+ x += NBITS;
+ }
+}
+
+static void
+test_alloc_unr(struct unrhdr *uh, u_int i, char a[])
+{
+ int j;
+
+ if (a[i]) {
+ printf("F %u\n", i);
+ free_unr(uh, i);
+ a[i] = 0;
+ } else {
+ no_alloc = 1;
+ j = alloc_unr(uh);
+ if (j != -1) {
+ a[j] = 1;
+ printf("A %d\n", j);
+ }
+ no_alloc = 0;
+ }
+}
+
+static void
+test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[])
+{
+ int j;
+
+ j = alloc_unr_specific(uh, i);
+ if (j == -1) {
+ printf("F %u\n", i);
+ a[i] = 0;
+ free_unr(uh, i);
+ } else {
+ a[i] = 1;
+ printf("A %d\n", j);
+ }
+}
+
+/* Number of unrs to test */
+#define NN 10000
+
+int
+main(int argc __unused, const char **argv __unused)
+{
+ struct unrhdr *uh;
+ u_int i, x, m, j;
+ char a[NN];
+
+ setbuf(stdout, NULL);
+ uh = new_unrhdr(0, NN - 1, NULL);
+ print_unrhdr(uh);
+
+ memset(a, 0, sizeof a);
+ srandomdev();
+
+ fprintf(stderr, "sizeof(struct unr) %zu\n", sizeof(struct unr));
+ fprintf(stderr, "sizeof(struct unrb) %zu\n", sizeof(struct unrb));
+ fprintf(stderr, "sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr));
+ fprintf(stderr, "NBITS %d\n", NBITS);
+ x = 1;
+ for (m = 0; m < NN * 100; m++) {
+ j = random();
+ i = (j >> 1) % NN;
+#if 0
+ if (a[i] && (j & 1))
+ continue;
+#endif
+ if ((random() & 1) != 0)
+ test_alloc_unr(uh, i, a);
+ else
+ test_alloc_unr_specific(uh, i, a);
+
+ if (1) /* XXX: change this for detailed debug printout */
+ print_unrhdr(uh);
+ check_unrhdr(uh, __LINE__);
+ }
+ for (i = 0; i < NN; i++) {
+ if (a[i]) {
+ printf("C %u\n", i);
+ free_unr(uh, i);
+ print_unrhdr(uh);
+ }
+ }
+ print_unrhdr(uh);
+ delete_unrhdr(uh);
+ return (0);
+}
+#endif
diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c
new file mode 100644
index 0000000..f3f3eec
--- /dev/null
+++ b/sys/kern/subr_vmem.c
@@ -0,0 +1,1487 @@
+/*-
+ * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * From:
+ * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
+ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
+ */
+
+/*
+ * reference:
+ * - Magazines and Vmem: Extending the Slab Allocator
+ * to Many CPUs and Arbitrary Resources
+ * http://www.usenix.org/event/usenix01/bonwick.html
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/callout.h>
+#include <sys/hash.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/condvar.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/vmem.h>
+
+#include "opt_vm.h"
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+#define VMEM_MAXORDER (sizeof(vmem_size_t) * NBBY)
+
+#define VMEM_HASHSIZE_MIN 16
+#define VMEM_HASHSIZE_MAX 131072
+
+#define VMEM_QCACHE_IDX_MAX 16
+
+#define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT)
+
+#define VMEM_FLAGS \
+ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)
+
+#define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)
+
+#define QC_NAME_MAX 16
+
+/*
+ * Data structures private to vmem.
+ */
+MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");
+
+typedef struct vmem_btag bt_t;
+
+TAILQ_HEAD(vmem_seglist, vmem_btag);
+LIST_HEAD(vmem_freelist, vmem_btag);
+LIST_HEAD(vmem_hashlist, vmem_btag);
+
+struct qcache {
+ uma_zone_t qc_cache;
+ vmem_t *qc_vmem;
+ vmem_size_t qc_size;
+ char qc_name[QC_NAME_MAX];
+};
+typedef struct qcache qcache_t;
+#define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache))
+
+#define VMEM_NAME_MAX 16
+
+/* vmem arena */
+struct vmem {
+ struct mtx_padalign vm_lock;
+ struct cv vm_cv;
+ char vm_name[VMEM_NAME_MAX+1];
+ LIST_ENTRY(vmem) vm_alllist;
+ struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN];
+ struct vmem_freelist vm_freelist[VMEM_MAXORDER];
+ struct vmem_seglist vm_seglist;
+ struct vmem_hashlist *vm_hashlist;
+ vmem_size_t vm_hashsize;
+
+ /* Constant after init */
+ vmem_size_t vm_qcache_max;
+ vmem_size_t vm_quantum_mask;
+ vmem_size_t vm_import_quantum;
+ int vm_quantum_shift;
+
+ /* Written on alloc/free */
+ LIST_HEAD(, vmem_btag) vm_freetags;
+ int vm_nfreetags;
+ int vm_nbusytag;
+ vmem_size_t vm_inuse;
+ vmem_size_t vm_size;
+
+ /* Used on import. */
+ vmem_import_t *vm_importfn;
+ vmem_release_t *vm_releasefn;
+ void *vm_arg;
+
+ /* Space exhaustion callback. */
+ vmem_reclaim_t *vm_reclaimfn;
+
+ /* quantum cache */
+ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX];
+};
+
+/* boundary tag */
+struct vmem_btag {
+ TAILQ_ENTRY(vmem_btag) bt_seglist;
+ union {
+ LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
+ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
+ } bt_u;
+#define bt_hashlist bt_u.u_hashlist
+#define bt_freelist bt_u.u_freelist
+ vmem_addr_t bt_start;
+ vmem_size_t bt_size;
+ int bt_type;
+};
+
+#define BT_TYPE_SPAN 1 /* Allocated from importfn */
+#define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */
+#define BT_TYPE_FREE 3 /* Available space. */
+#define BT_TYPE_BUSY 4 /* Used space. */
+#define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC)
+
+#define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1)
+
+#if defined(DIAGNOSTIC)
+static int enable_vmem_check = 1;
+SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RW,
+ &enable_vmem_check, 0, "Enable vmem check");
+static void vmem_check(vmem_t *);
+#endif
+
+static struct callout vmem_periodic_ch;
+static int vmem_periodic_interval;
+static struct task vmem_periodic_wk;
+
+static struct mtx_padalign vmem_list_lock;
+static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
+
+/* ---- misc */
+#define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan)
+#define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv)
+#define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock)
+#define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv)
+
+
+#define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock)
+#define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock)
+#define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock)
+#define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
+#define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock)
+#define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED);
+
+#define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align)))
+
+#define VMEM_CROSS_P(addr1, addr2, boundary) \
+ ((((addr1) ^ (addr2)) & -(boundary)) != 0)
+
+#define ORDER2SIZE(order) ((vmem_size_t)1 << (order))
+#define SIZE2ORDER(size) ((int)flsl(size) - 1)
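
A brief aside on the VMEM_ALIGNUP() trick above: with unsigned arithmetic and a
power-of-two align, negating, masking and negating again rounds addr up to the next
align boundary while leaving already-aligned values untouched. The values below are
purely illustrative.

    /*
     * Illustrative values (unsigned arithmetic, power-of-two align):
     *   VMEM_ALIGNUP(0x1234, 0x1000):
     *     -(0x1234)   == 0x...ffffedcc
     *     & -(0x1000) == 0x...ffffe000   (low 12 bits cleared)
     *     negated     ==       0x2000    (rounded up to the boundary)
     *   VMEM_ALIGNUP(0x2000, 0x1000) == 0x2000 (already aligned, unchanged)
     */
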
+
+/*
+ * Maximum number of boundary tags that may be required to satisfy an
+ * allocation. Two may be required to import. Another two may be
+ * required to clip edges.
+ */
+#define BT_MAXALLOC 4
+
+/*
+ * Max free limits the number of locally cached boundary tags. We
+ * just want to avoid hitting the zone allocator for every call.
+ */
+#define BT_MAXFREE (BT_MAXALLOC * 8)
+
+/* Allocator for boundary tags. */
+static uma_zone_t vmem_bt_zone;
+
+/* boot time arena storage. */
+static struct vmem kernel_arena_storage;
+static struct vmem kmem_arena_storage;
+static struct vmem buffer_arena_storage;
+static struct vmem transient_arena_storage;
+vmem_t *kernel_arena = &kernel_arena_storage;
+vmem_t *kmem_arena = &kmem_arena_storage;
+vmem_t *buffer_arena = &buffer_arena_storage;
+vmem_t *transient_arena = &transient_arena_storage;
+
+#ifdef DEBUG_MEMGUARD
+static struct vmem memguard_arena_storage;
+vmem_t *memguard_arena = &memguard_arena_storage;
+#endif
+
+/*
+ * Fill the vmem's boundary tag cache. We guarantee that boundary tag
+ * allocation will not fail once bt_fill() passes. To do so we cache
+ * at least the maximum possible tag allocations in the arena.
+ */
+static int
+bt_fill(vmem_t *vm, int flags)
+{
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+
+ /*
+ * Only allow the kmem arena to dip into reserve tags. It is the
+ * vmem where new tags come from.
+ */
+ flags &= BT_FLAGS;
+ if (vm != kmem_arena)
+ flags &= ~M_USE_RESERVE;
+
+ /*
+ * Loop until we meet the reserve. To minimize the lock shuffle
+ * and prevent simultaneous fills we first try a NOWAIT regardless
+ * of the caller's flags. Specify M_NOVM so we don't recurse while
+ * holding a vmem lock.
+ */
+ while (vm->vm_nfreetags < BT_MAXALLOC) {
+ bt = uma_zalloc(vmem_bt_zone,
+ (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
+ if (bt == NULL) {
+ VMEM_UNLOCK(vm);
+ bt = uma_zalloc(vmem_bt_zone, flags);
+ VMEM_LOCK(vm);
+ if (bt == NULL && (flags & M_NOWAIT) != 0)
+ break;
+ }
+ LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
+ vm->vm_nfreetags++;
+ }
+
+ if (vm->vm_nfreetags < BT_MAXALLOC)
+ return ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pop a tag off of the freetag stack.
+ */
+static bt_t *
+bt_alloc(vmem_t *vm)
+{
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+ bt = LIST_FIRST(&vm->vm_freetags);
+ MPASS(bt != NULL);
+ LIST_REMOVE(bt, bt_freelist);
+ vm->vm_nfreetags--;
+
+ return bt;
+}
+
+/*
+ * Trim the per-vmem free list. Returns with the lock released to
+ * avoid allocator recursions.
+ */
+static void
+bt_freetrim(vmem_t *vm, int freelimit)
+{
+ LIST_HEAD(, vmem_btag) freetags;
+ bt_t *bt;
+
+ LIST_INIT(&freetags);
+ VMEM_ASSERT_LOCKED(vm);
+ while (vm->vm_nfreetags > freelimit) {
+ bt = LIST_FIRST(&vm->vm_freetags);
+ LIST_REMOVE(bt, bt_freelist);
+ vm->vm_nfreetags--;
+ LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
+ }
+ VMEM_UNLOCK(vm);
+ while ((bt = LIST_FIRST(&freetags)) != NULL) {
+ LIST_REMOVE(bt, bt_freelist);
+ uma_zfree(vmem_bt_zone, bt);
+ }
+}
+
+static inline void
+bt_free(vmem_t *vm, bt_t *bt)
+{
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
+ LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
+ vm->vm_nfreetags++;
+}
+
+/*
+ * freelist[0] ... [1, 1]
+ * freelist[1] ... [2, 3]
+ * freelist[2] ... [4, 7]
+ * freelist[3] ... [8, 15]
+ * :
+ * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1]
+ * :
+ */
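
To make the bucketing above concrete (an illustrative example, assuming a 4 KB
quantum):

    /*
     * Example: a 20 KB free segment has qsize == 5 and
     * SIZE2ORDER(5) == flsl(5) - 1 == 2, so bt_insfree() places it on
     * vm_freelist[2], the [4, 7] bucket above.  For an M_FIRSTFIT request of
     * those same 5 quanta, bt_freehead_toalloc() starts one list higher
     * (index 3) so that any block found there is already big enough, while
     * M_BESTFIT starts its scan at index 2.
     */
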
+
+static struct vmem_freelist *
+bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
+{
+ const vmem_size_t qsize = size >> vm->vm_quantum_shift;
+ const int idx = SIZE2ORDER(qsize);
+
+ MPASS(size != 0 && qsize != 0);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+ MPASS(idx >= 0);
+ MPASS(idx < VMEM_MAXORDER);
+
+ return &vm->vm_freelist[idx];
+}
+
+/*
+ * bt_freehead_toalloc: return the freelist for the given size and allocation
+ * strategy.
+ *
+ * For M_FIRSTFIT, return the list in which any blocks are large enough
+ * for the requested size. Otherwise, return the list which can have blocks
+ * large enough for the requested size.
+ */
+static struct vmem_freelist *
+bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
+{
+ const vmem_size_t qsize = size >> vm->vm_quantum_shift;
+ int idx = SIZE2ORDER(qsize);
+
+ MPASS(size != 0 && qsize != 0);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+
+ if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
+ idx++;
+ /* check for a too-large request? */
+ }
+ MPASS(idx >= 0);
+ MPASS(idx < VMEM_MAXORDER);
+
+ return &vm->vm_freelist[idx];
+}
+
+/* ---- boundary tag hash */
+
+static struct vmem_hashlist *
+bt_hashhead(vmem_t *vm, vmem_addr_t addr)
+{
+ struct vmem_hashlist *list;
+ unsigned int hash;
+
+ hash = hash32_buf(&addr, sizeof(addr), 0);
+ list = &vm->vm_hashlist[hash % vm->vm_hashsize];
+
+ return list;
+}
+
+static bt_t *
+bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
+{
+ struct vmem_hashlist *list;
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+ list = bt_hashhead(vm, addr);
+ LIST_FOREACH(bt, list, bt_hashlist) {
+ if (bt->bt_start == addr) {
+ break;
+ }
+ }
+
+ return bt;
+}
+
+static void
+bt_rembusy(vmem_t *vm, bt_t *bt)
+{
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(vm->vm_nbusytag > 0);
+ vm->vm_inuse -= bt->bt_size;
+ vm->vm_nbusytag--;
+ LIST_REMOVE(bt, bt_hashlist);
+}
+
+static void
+bt_insbusy(vmem_t *vm, bt_t *bt)
+{
+ struct vmem_hashlist *list;
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(bt->bt_type == BT_TYPE_BUSY);
+
+ list = bt_hashhead(vm, bt->bt_start);
+ LIST_INSERT_HEAD(list, bt, bt_hashlist);
+ vm->vm_nbusytag++;
+ vm->vm_inuse += bt->bt_size;
+}
+
+/* ---- boundary tag list */
+
+static void
+bt_remseg(vmem_t *vm, bt_t *bt)
+{
+
+ TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
+ bt_free(vm, bt);
+}
+
+static void
+bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
+{
+
+ TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
+}
+
+static void
+bt_insseg_tail(vmem_t *vm, bt_t *bt)
+{
+
+ TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
+}
+
+static void
+bt_remfree(vmem_t *vm, bt_t *bt)
+{
+
+ MPASS(bt->bt_type == BT_TYPE_FREE);
+
+ LIST_REMOVE(bt, bt_freelist);
+}
+
+static void
+bt_insfree(vmem_t *vm, bt_t *bt)
+{
+ struct vmem_freelist *list;
+
+ list = bt_freehead_tofree(vm, bt->bt_size);
+ LIST_INSERT_HEAD(list, bt, bt_freelist);
+}
+
+/* ---- vmem internal functions */
+
+/*
+ * Import from the arena into the quantum cache in UMA.
+ */
+static int
+qc_import(void *arg, void **store, int cnt, int flags)
+{
+ qcache_t *qc;
+ vmem_addr_t addr;
+ int i;
+
+ qc = arg;
+ flags |= M_BESTFIT;
+ for (i = 0; i < cnt; i++) {
+ if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
+ VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
+ break;
+ store[i] = (void *)addr;
+ /* Only guarantee one allocation. */
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
+ }
+ return i;
+}
+
+/*
+ * Release memory from the UMA cache to the arena.
+ */
+static void
+qc_release(void *arg, void **store, int cnt)
+{
+ qcache_t *qc;
+ int i;
+
+ qc = arg;
+ for (i = 0; i < cnt; i++)
+ vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
+}
+
+static void
+qc_init(vmem_t *vm, vmem_size_t qcache_max)
+{
+ qcache_t *qc;
+ vmem_size_t size;
+ int qcache_idx_max;
+ int i;
+
+ MPASS((qcache_max & vm->vm_quantum_mask) == 0);
+ qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
+ VMEM_QCACHE_IDX_MAX);
+ vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++) {
+ qc = &vm->vm_qcache[i];
+ size = (i + 1) << vm->vm_quantum_shift;
+ snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
+ vm->vm_name, size);
+ qc->qc_vmem = vm;
+ qc->qc_size = size;
+ qc->qc_cache = uma_zcache_create(qc->qc_name, size,
+ NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
+ UMA_ZONE_VM);
+ MPASS(qc->qc_cache);
+ }
+}
+
+static void
+qc_destroy(vmem_t *vm)
+{
+ int qcache_idx_max;
+ int i;
+
+ qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++)
+ uma_zdestroy(vm->vm_qcache[i].qc_cache);
+}
+
+static void
+qc_drain(vmem_t *vm)
+{
+ int qcache_idx_max;
+ int i;
+
+ qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++)
+ zone_drain(vm->vm_qcache[i].qc_cache);
+}
+
+#ifndef UMA_MD_SMALL_ALLOC
+
+static struct mtx_padalign vmem_bt_lock;
+
+/*
+ * vmem_bt_alloc: Allocate a new page of boundary tags.
+ *
+ * On architectures with uma_small_alloc there is no recursion; no address
+ * space need be allocated to allocate boundary tags. For the others, we
+ * must handle recursion. Boundary tags are necessary to allocate new
+ * boundary tags.
+ *
+ * UMA guarantees that enough tags are held in reserve to allocate a new
+ * page of kva. We dip into this reserve by specifying M_USE_RESERVE only
+ * when allocating the page to hold new boundary tags. In this way the
+ * reserve is automatically filled by the allocation that uses the reserve.
+ *
+ * We still have to guarantee that the new tags are allocated atomically since
+ * many threads may try concurrently. The bt_lock provides this guarantee.
+ * We convert WAITOK allocations to NOWAIT and then handle the blocking here
+ * on failure. It's ok to return NULL for a WAITOK allocation as UMA will
+ * loop again after checking to see if we lost the race to allocate.
+ *
+ * There is a small race between vmem_bt_alloc() returning the page and the
+ * zone lock being acquired to add the page to the zone. For WAITOK
+ * allocations we just pause briefly. NOWAIT may experience a transient
+ * failure. To alleviate this we permit a small number of simultaneous
+ * fills to proceed concurrently so NOWAIT is less likely to fail unless
+ * we are really out of KVA.
+ */
+static void *
+vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+{
+ vmem_addr_t addr;
+
+ *pflag = UMA_SLAB_KMEM;
+
+ /*
+ * Single thread boundary tag allocation so that the address space
+ * and memory are added in one atomic operation.
+ */
+ mtx_lock(&vmem_bt_lock);
+ if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN,
+ VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT,
+ &addr) == 0) {
+ if (kmem_back(kmem_object, addr, bytes,
+ M_NOWAIT | M_USE_RESERVE) == 0) {
+ mtx_unlock(&vmem_bt_lock);
+ return ((void *)addr);
+ }
+ vmem_xfree(kmem_arena, addr, bytes);
+ mtx_unlock(&vmem_bt_lock);
+ /*
+ * Out of memory, not address space. This may not even be
+ * possible due to M_USE_RESERVE page allocation.
+ */
+ if (wait & M_WAITOK)
+ VM_WAIT;
+ return (NULL);
+ }
+ mtx_unlock(&vmem_bt_lock);
+ /*
+ * We're either out of address space or lost a fill race.
+ */
+ if (wait & M_WAITOK)
+ pause("btalloc", 1);
+
+ return (NULL);
+}
+#endif
+
+void
+vmem_startup(void)
+{
+
+ mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
+ vmem_bt_zone = uma_zcreate("vmem btag",
+ sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_VM);
+#ifndef UMA_MD_SMALL_ALLOC
+ mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
+ uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
+ /*
+ * Reserve enough tags to allocate new tags. We allow multiple
+ * CPUs to attempt to allocate new tags concurrently to limit
+ * false restarts in UMA.
+ */
+ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2);
+ uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
+#endif
+}
+
+/* ---- rehash */
+
+static int
+vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
+{
+ bt_t *bt;
+ int i;
+ struct vmem_hashlist *newhashlist;
+ struct vmem_hashlist *oldhashlist;
+ vmem_size_t oldhashsize;
+
+ MPASS(newhashsize > 0);
+
+ newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
+ M_VMEM, M_NOWAIT);
+ if (newhashlist == NULL)
+ return ENOMEM;
+ for (i = 0; i < newhashsize; i++) {
+ LIST_INIT(&newhashlist[i]);
+ }
+
+ VMEM_LOCK(vm);
+ oldhashlist = vm->vm_hashlist;
+ oldhashsize = vm->vm_hashsize;
+ vm->vm_hashlist = newhashlist;
+ vm->vm_hashsize = newhashsize;
+ if (oldhashlist == NULL) {
+ VMEM_UNLOCK(vm);
+ return 0;
+ }
+ for (i = 0; i < oldhashsize; i++) {
+ while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
+ bt_rembusy(vm, bt);
+ bt_insbusy(vm, bt);
+ }
+ }
+ VMEM_UNLOCK(vm);
+
+ if (oldhashlist != vm->vm_hash0) {
+ free(oldhashlist, M_VMEM);
+ }
+
+ return 0;
+}
+
+static void
+vmem_periodic_kick(void *dummy)
+{
+
+ taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
+}
+
+static void
+vmem_periodic(void *unused, int pending)
+{
+ vmem_t *vm;
+ vmem_size_t desired;
+ vmem_size_t current;
+
+ mtx_lock(&vmem_list_lock);
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+#ifdef DIAGNOSTIC
+ /* Convenient time to verify vmem state. */
+ if (enable_vmem_check == 1) {
+ VMEM_LOCK(vm);
+ vmem_check(vm);
+ VMEM_UNLOCK(vm);
+ }
+#endif
+ desired = 1 << flsl(vm->vm_nbusytag);
+ desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
+ VMEM_HASHSIZE_MAX);
+ current = vm->vm_hashsize;
+
+ /* Grow in powers of two. Shrink less aggressively. */
+ if (desired >= current * 2 || desired * 4 <= current)
+ vmem_rehash(vm, desired);
+ }
+ mtx_unlock(&vmem_list_lock);
+
+ callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
+ vmem_periodic_kick, NULL);
+}
+
+static void
+vmem_start_callout(void *unused)
+{
+
+ TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
+ vmem_periodic_interval = hz * 10;
+ callout_init(&vmem_periodic_ch, CALLOUT_MPSAFE);
+ callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
+ vmem_periodic_kick, NULL);
+}
+SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);
+
+static void
+vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
+{
+ bt_t *btspan;
+ bt_t *btfree;
+
+ MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+
+ btspan = bt_alloc(vm);
+ btspan->bt_type = type;
+ btspan->bt_start = addr;
+ btspan->bt_size = size;
+ bt_insseg_tail(vm, btspan);
+
+ btfree = bt_alloc(vm);
+ btfree->bt_type = BT_TYPE_FREE;
+ btfree->bt_start = addr;
+ btfree->bt_size = size;
+ bt_insseg(vm, btfree, btspan);
+ bt_insfree(vm, btfree);
+
+ vm->vm_size += size;
+}
+
+static void
+vmem_destroy1(vmem_t *vm)
+{
+ bt_t *bt;
+
+ /*
+ * Drain per-cpu quantum caches.
+ */
+ qc_destroy(vm);
+
+ /*
+ * The vmem should now only contain empty segments.
+ */
+ VMEM_LOCK(vm);
+ MPASS(vm->vm_nbusytag == 0);
+
+ while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
+ bt_remseg(vm, bt);
+
+ if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
+ free(vm->vm_hashlist, M_VMEM);
+
+ bt_freetrim(vm, 0);
+
+ VMEM_CONDVAR_DESTROY(vm);
+ VMEM_LOCK_DESTROY(vm);
+ free(vm, M_VMEM);
+}
+
+static int
+vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
+{
+ vmem_addr_t addr;
+ int error;
+
+ if (vm->vm_importfn == NULL)
+ return EINVAL;
+
+ /*
+ * To make sure we get a span that meets the alignment, we double the
+ * alignment and add it to the requested size. This slightly overestimates.
+ */
+ if (align != vm->vm_quantum_mask + 1)
+ size = (align * 2) + size;
+ size = roundup(size, vm->vm_import_quantum);
+
+ /*
+ * Hide MAXALLOC tags so we're guaranteed to be able to add this
+ * span and the tag we want to allocate from it.
+ */
+ MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
+ vm->vm_nfreetags -= BT_MAXALLOC;
+ VMEM_UNLOCK(vm);
+ error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
+ VMEM_LOCK(vm);
+ vm->vm_nfreetags += BT_MAXALLOC;
+ if (error)
+ return ENOMEM;
+
+ vmem_add1(vm, addr, size, BT_TYPE_SPAN);
+
+ return 0;
+}
+
+/*
+ * vmem_fit: check if a bt can satisfy the given restrictions.
+ *
+ * It is the caller's responsibility to ensure the region is big enough
+ * before calling us.
+ */
+static int
+vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
+ vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
+ vmem_addr_t maxaddr, vmem_addr_t *addrp)
+{
+ vmem_addr_t start;
+ vmem_addr_t end;
+
+ MPASS(size > 0);
+ MPASS(bt->bt_size >= size); /* caller's responsibility */
+
+ /*
+ * XXX assumption: vmem_addr_t and vmem_size_t are
+ * unsigned integers of the same size.
+ */
+
+ start = bt->bt_start;
+ if (start < minaddr) {
+ start = minaddr;
+ }
+ end = BT_END(bt);
+ if (end > maxaddr)
+ end = maxaddr;
+ if (start > end)
+ return (ENOMEM);
+
+ start = VMEM_ALIGNUP(start - phase, align) + phase;
+ if (start < bt->bt_start)
+ start += align;
+ if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
+ MPASS(align < nocross);
+ start = VMEM_ALIGNUP(start - phase, nocross) + phase;
+ }
+ if (start <= end && end - start >= size - 1) {
+ MPASS((start & (align - 1)) == phase);
+ MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
+ MPASS(minaddr <= start);
+ MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
+ MPASS(bt->bt_start <= start);
+ MPASS(BT_END(bt) - start >= size - 1);
+ *addrp = start;
+
+ return (0);
+ }
+ return (ENOMEM);
+}
+
+/*
+ * vmem_clip: Trim the boundary tag edges to the requested start and size.
+ */
+static void
+vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
+{
+ bt_t *btnew;
+ bt_t *btprev;
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(bt->bt_type == BT_TYPE_FREE);
+ MPASS(bt->bt_size >= size);
+ bt_remfree(vm, bt);
+ if (bt->bt_start != start) {
+ btprev = bt_alloc(vm);
+ btprev->bt_type = BT_TYPE_FREE;
+ btprev->bt_start = bt->bt_start;
+ btprev->bt_size = start - bt->bt_start;
+ bt->bt_start = start;
+ bt->bt_size -= btprev->bt_size;
+ bt_insfree(vm, btprev);
+ bt_insseg(vm, btprev,
+ TAILQ_PREV(bt, vmem_seglist, bt_seglist));
+ }
+ MPASS(bt->bt_start == start);
+ if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
+ /* split */
+ btnew = bt_alloc(vm);
+ btnew->bt_type = BT_TYPE_BUSY;
+ btnew->bt_start = bt->bt_start;
+ btnew->bt_size = size;
+ bt->bt_start = bt->bt_start + size;
+ bt->bt_size -= size;
+ bt_insfree(vm, bt);
+ bt_insseg(vm, btnew,
+ TAILQ_PREV(bt, vmem_seglist, bt_seglist));
+ bt_insbusy(vm, btnew);
+ bt = btnew;
+ } else {
+ bt->bt_type = BT_TYPE_BUSY;
+ bt_insbusy(vm, bt);
+ }
+ MPASS(bt->bt_size >= size);
+ bt->bt_type = BT_TYPE_BUSY;
+}
+
+/* ---- vmem API */
+
+void
+vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
+ vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
+{
+
+ VMEM_LOCK(vm);
+ vm->vm_importfn = importfn;
+ vm->vm_releasefn = releasefn;
+ vm->vm_arg = arg;
+ vm->vm_import_quantum = import_quantum;
+ VMEM_UNLOCK(vm);
+}
+
+void
+vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
+{
+
+ VMEM_LOCK(vm);
+ vm->vm_reclaimfn = reclaimfn;
+ VMEM_UNLOCK(vm);
+}
+
+/*
+ * vmem_init: Initializes vmem arena.
+ */
+vmem_t *
+vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
+ vmem_size_t quantum, vmem_size_t qcache_max, int flags)
+{
+ int i;
+
+ MPASS(quantum > 0);
+
+ bzero(vm, sizeof(*vm));
+
+ VMEM_CONDVAR_INIT(vm, name);
+ VMEM_LOCK_INIT(vm, name);
+ vm->vm_nfreetags = 0;
+ LIST_INIT(&vm->vm_freetags);
+ strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
+ vm->vm_quantum_mask = quantum - 1;
+ vm->vm_quantum_shift = SIZE2ORDER(quantum);
+ MPASS(ORDER2SIZE(vm->vm_quantum_shift) == quantum);
+ vm->vm_nbusytag = 0;
+ vm->vm_size = 0;
+ vm->vm_inuse = 0;
+ qc_init(vm, qcache_max);
+
+ TAILQ_INIT(&vm->vm_seglist);
+ for (i = 0; i < VMEM_MAXORDER; i++) {
+ LIST_INIT(&vm->vm_freelist[i]);
+ }
+ memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
+ vm->vm_hashsize = VMEM_HASHSIZE_MIN;
+ vm->vm_hashlist = vm->vm_hash0;
+
+ if (size != 0) {
+ if (vmem_add(vm, base, size, flags) != 0) {
+ vmem_destroy1(vm);
+ return NULL;
+ }
+ }
+
+ mtx_lock(&vmem_list_lock);
+ LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
+ mtx_unlock(&vmem_list_lock);
+
+ return vm;
+}
+
+/*
+ * vmem_create: create an arena.
+ */
+vmem_t *
+vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
+ vmem_size_t quantum, vmem_size_t qcache_max, int flags)
+{
+
+ vmem_t *vm;
+
+ vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT));
+ if (vm == NULL)
+ return (NULL);
+ if (vmem_init(vm, name, base, size, quantum, qcache_max,
+ flags) == NULL) {
+ free(vm, M_VMEM);
+ return (NULL);
+ }
+ return (vm);
+}
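
A hedged sketch of arena usage with the functions defined in this file; foo_arena,
the address range and the sizes are illustrative only, and error handling is reduced
to checking return values.

    static void
    foo_arena_demo(void)
    {
            vmem_t *foo_arena;
            vmem_addr_t addr;

            /* Arena covering [0x100000, 0x200000) with a 4 KB quantum, no qcache. */
            foo_arena = vmem_create("foo", 0x100000, 0x100000, 4096, 0, M_WAITOK);

            /* Best-fit allocation of 64 KB; the address comes back in addr. */
            if (vmem_alloc(foo_arena, 65536, M_BESTFIT | M_WAITOK, &addr) == 0)
                    vmem_xfree(foo_arena, addr, 65536);

            vmem_destroy(foo_arena);
    }
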
+
+void
+vmem_destroy(vmem_t *vm)
+{
+
+ mtx_lock(&vmem_list_lock);
+ LIST_REMOVE(vm, vm_alllist);
+ mtx_unlock(&vmem_list_lock);
+
+ vmem_destroy1(vm);
+}
+
+vmem_size_t
+vmem_roundup_size(vmem_t *vm, vmem_size_t size)
+{
+
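+ /* Round size up to a multiple of the arena's quantum. */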
+ return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
+}
+
+/*
+ * vmem_alloc: allocate a resource from the arena.
+ */
+int
+vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+ const int strat __unused = flags & VMEM_FITMASK;
+ qcache_t *qc;
+
+ flags &= VMEM_FLAGS;
+ MPASS(size > 0);
+ MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
+ if ((flags & M_NOWAIT) == 0)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");
+
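+ /*
+ * Requests no larger than the quantum cache limit are satisfied
+ * directly from the per-size UMA caches instead of going through
+ * vmem_xalloc().
+ */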
+ if (size <= vm->vm_qcache_max) {
+ qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
+ *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
+ if (*addrp == 0)
+ return (ENOMEM);
+ return (0);
+ }
+
+ return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
+ flags, addrp);
+}
+
+int
+vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
+ const vmem_size_t phase, const vmem_size_t nocross,
+ const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
+ vmem_addr_t *addrp)
+{
+ const vmem_size_t size = vmem_roundup_size(vm, size0);
+ struct vmem_freelist *list;
+ struct vmem_freelist *first;
+ struct vmem_freelist *end;
+ vmem_size_t avail;
+ bt_t *bt;
+ int error;
+ int strat;
+
+ flags &= VMEM_FLAGS;
+ strat = flags & VMEM_FITMASK;
+ MPASS(size0 > 0);
+ MPASS(size > 0);
+ MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
+ MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
+ if ((flags & M_NOWAIT) == 0)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
+ MPASS((align & vm->vm_quantum_mask) == 0);
+ MPASS((align & (align - 1)) == 0);
+ MPASS((phase & vm->vm_quantum_mask) == 0);
+ MPASS((nocross & vm->vm_quantum_mask) == 0);
+ MPASS((nocross & (nocross - 1)) == 0);
+ MPASS((align == 0 && phase == 0) || phase < align);
+ MPASS(nocross == 0 || nocross >= size);
+ MPASS(minaddr <= maxaddr);
+ MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
+
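+ /* An alignment of 0 means the arena's natural (quantum) alignment. */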
+ if (align == 0)
+ align = vm->vm_quantum_mask + 1;
+
+ *addrp = 0;
+ end = &vm->vm_freelist[VMEM_MAXORDER];
+ /*
+ * choose a free block from which we allocate.
+ */
+ first = bt_freehead_toalloc(vm, size, strat);
+ VMEM_LOCK(vm);
+ for (;;) {
+ /*
+ * Make sure we have enough tags to complete the
+ * operation.
+ */
+ if (vm->vm_nfreetags < BT_MAXALLOC &&
+ bt_fill(vm, flags) != 0) {
+ error = ENOMEM;
+ break;
+ }
+ /*
+ * Scan freelists looking for a tag that satisfies the
+ * allocation. If we're doing BESTFIT we may encounter
+ * sizes below the request. If we're doing FIRSTFIT we
+ * inspect only the first element from each list.
+ */
+ for (list = first; list < end; list++) {
+ LIST_FOREACH(bt, list, bt_freelist) {
+ if (bt->bt_size >= size) {
+ error = vmem_fit(bt, size, align, phase,
+ nocross, minaddr, maxaddr, addrp);
+ if (error == 0) {
+ vmem_clip(vm, bt, *addrp, size);
+ goto out;
+ }
+ }
+ /* FIRST skips to the next list. */
+ if (strat == M_FIRSTFIT)
+ break;
+ }
+ }
+ /*
+ * Retry if the fast algorithm failed.
+ */
+ if (strat == M_FIRSTFIT) {
+ strat = M_BESTFIT;
+ first = bt_freehead_toalloc(vm, size, strat);
+ continue;
+ }
+ /*
+ * XXX it is possible to fail to meet restrictions with the
+ * imported region. It is up to the user to specify the
+ * import quantum such that it can satisfy any allocation.
+ */
+ if (vmem_import(vm, size, align, flags) == 0)
+ continue;
+
+ /*
+ * Try to free some space from the quantum cache or reclaim
+ * functions if available.
+ */
+ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
+ avail = vm->vm_size - vm->vm_inuse;
+ VMEM_UNLOCK(vm);
+ if (vm->vm_qcache_max != 0)
+ qc_drain(vm);
+ if (vm->vm_reclaimfn != NULL)
+ vm->vm_reclaimfn(vm, flags);
+ VMEM_LOCK(vm);
+ /* If we were successful retry even NOWAIT. */
+ if (vm->vm_size - vm->vm_inuse > avail)
+ continue;
+ }
+ if ((flags & M_NOWAIT) != 0) {
+ error = ENOMEM;
+ break;
+ }
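+ /* Wait for resources to be freed; vmem_xfree() broadcasts the condvar. */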
+ VMEM_CONDVAR_WAIT(vm);
+ }
+out:
+ VMEM_UNLOCK(vm);
+ if (error != 0 && (flags & M_NOWAIT) == 0)
+ panic("failed to allocate waiting allocation\n");
+
+ return (error);
+}
+
+/*
+ * vmem_free: free the resource to the arena.
+ */
+void
+vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
+{
+ qcache_t *qc;
+ MPASS(size > 0);
+
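+ /* Allocations small enough to have come from the quantum cache go back there. */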
+ if (size <= vm->vm_qcache_max) {
+ qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
+ uma_zfree(qc->qc_cache, (void *)addr);
+ } else
+ vmem_xfree(vm, addr, size);
+}
+
+void
+vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
+{
+ bt_t *bt;
+ bt_t *t;
+
+ MPASS(size > 0);
+
+ VMEM_LOCK(vm);
+ bt = bt_lookupbusy(vm, addr);
+ MPASS(bt != NULL);
+ MPASS(bt->bt_start == addr);
+ MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
+ bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
+ MPASS(bt->bt_type == BT_TYPE_BUSY);
+ bt_rembusy(vm, bt);
+ bt->bt_type = BT_TYPE_FREE;
+
+ /* coalesce */
+ t = TAILQ_NEXT(bt, bt_seglist);
+ if (t != NULL && t->bt_type == BT_TYPE_FREE) {
+ MPASS(BT_END(bt) < t->bt_start); /* YYY */
+ bt->bt_size += t->bt_size;
+ bt_remfree(vm, t);
+ bt_remseg(vm, t);
+ }
+ t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
+ if (t != NULL && t->bt_type == BT_TYPE_FREE) {
+ MPASS(BT_END(t) < bt->bt_start); /* YYY */
+ bt->bt_size += t->bt_size;
+ bt->bt_start = t->bt_start;
+ bt_remfree(vm, t);
+ bt_remseg(vm, t);
+ }
+
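+ /*
+ * If the coalesced free segment covers its entire imported span,
+ * return the whole span to the source via the release function.
+ */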
+ t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
+ MPASS(t != NULL);
+ MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
+ if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
+ t->bt_size == bt->bt_size) {
+ vmem_addr_t spanaddr;
+ vmem_size_t spansize;
+
+ MPASS(t->bt_start == bt->bt_start);
+ spanaddr = bt->bt_start;
+ spansize = bt->bt_size;
+ bt_remseg(vm, bt);
+ bt_remseg(vm, t);
+ vm->vm_size -= spansize;
+ VMEM_CONDVAR_BROADCAST(vm);
+ bt_freetrim(vm, BT_MAXFREE);
+ (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
+ } else {
+ bt_insfree(vm, bt);
+ VMEM_CONDVAR_BROADCAST(vm);
+ bt_freetrim(vm, BT_MAXFREE);
+ }
+}
+
+/*
+ * vmem_add: add a contiguous span of addresses to the arena.
+ */
+int
+vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
+{
+ int error;
+
+ error = 0;
+ flags &= VMEM_FLAGS;
+ VMEM_LOCK(vm);
+ if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0)
+ vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
+ else
+ error = ENOMEM;
+ VMEM_UNLOCK(vm);
+
+ return (error);
+}
+
+/*
+ * vmem_size: report the arena's allocated, free, or total size.
+ */
+vmem_size_t
+vmem_size(vmem_t *vm, int typemask)
+{
+
+ switch (typemask) {
+ case VMEM_ALLOC:
+ return vm->vm_inuse;
+ case VMEM_FREE:
+ return vm->vm_size - vm->vm_inuse;
+ case VMEM_FREE|VMEM_ALLOC:
+ return vm->vm_size;
+ default:
+ panic("vmem_size");
+ }
+}
+
+/* ---- debug */
+
+#if defined(DDB) || defined(DIAGNOSTIC)
+
+static void bt_dump(const bt_t *, int (*)(const char *, ...)
+ __printflike(1, 2));
+
+static const char *
+bt_type_string(int type)
+{
+
+ switch (type) {
+ case BT_TYPE_BUSY:
+ return "busy";
+ case BT_TYPE_FREE:
+ return "free";
+ case BT_TYPE_SPAN:
+ return "span";
+ case BT_TYPE_SPAN_STATIC:
+ return "static span";
+ default:
+ break;
+ }
+ return "BOGUS";
+}
+
+static void
+bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
+{
+
+ (*pr)("\t%p: %jx %jx, %d(%s)\n",
+ bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
+ bt->bt_type, bt_type_string(bt->bt_type));
+}
+
+static void
+vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...) __printflike(1, 2))
+{
+ const bt_t *bt;
+ int i;
+
+ (*pr)("vmem %p '%s'\n", vm, vm->vm_name);
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ bt_dump(bt, pr);
+ }
+
+ for (i = 0; i < VMEM_MAXORDER; i++) {
+ const struct vmem_freelist *fl = &vm->vm_freelist[i];
+
+ if (LIST_EMPTY(fl)) {
+ continue;
+ }
+
+ (*pr)("freelist[%d]\n", i);
+ LIST_FOREACH(bt, fl, bt_freelist) {
+ bt_dump(bt, pr);
+ }
+ }
+}
+
+#endif /* defined(DDB) || defined(DIAGNOSTIC) */
+
+#if defined(DDB)
+static bt_t *
+vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
+{
+ bt_t *bt;
+
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ if (BT_ISSPAN_P(bt)) {
+ continue;
+ }
+ if (bt->bt_start <= addr && addr <= BT_END(bt)) {
+ return bt;
+ }
+ }
+
+ return NULL;
+}
+
+void
+vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
+{
+ vmem_t *vm;
+
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+ bt_t *bt;
+
+ bt = vmem_whatis_lookup(vm, addr);
+ if (bt == NULL) {
+ continue;
+ }
+ (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
+ (void *)addr, (void *)bt->bt_start,
+ (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
+ (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
+ }
+}
+
+void
+vmem_printall(const char *modif, int (*pr)(const char *, ...))
+{
+ const vmem_t *vm;
+
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+ vmem_dump(vm, pr);
+ }
+}
+
+void
+vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
+{
+ const vmem_t *vm = (const void *)addr;
+
+ vmem_dump(vm, pr);
+}
+#endif /* defined(DDB) */
+
+#define vmem_printf printf
+
+#if defined(DIAGNOSTIC)
+
+static bool
+vmem_check_sanity(vmem_t *vm)
+{
+ const bt_t *bt, *bt2;
+
+ MPASS(vm != NULL);
+
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ if (bt->bt_start > BT_END(bt)) {
+ printf("corrupted tag\n");
+ bt_dump(bt, vmem_printf);
+ return false;
+ }
+ }
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
+ if (bt == bt2) {
+ continue;
+ }
+ if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
+ continue;
+ }
+ if (bt->bt_start <= BT_END(bt2) &&
+ bt2->bt_start <= BT_END(bt)) {
+ printf("overwrapped tags\n");
+ bt_dump(bt, vmem_printf);
+ bt_dump(bt2, vmem_printf);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+static void
+vmem_check(vmem_t *vm)
+{
+
+ if (!vmem_check_sanity(vm)) {
+ panic("insanity vmem %p", vm);
+ }
+}
+
+#endif /* defined(DIAGNOSTIC) */
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
new file mode 100644
index 0000000..9d3040d
--- /dev/null
+++ b/sys/kern/subr_witness.c
@@ -0,0 +1,2912 @@
+/*-
+ * Copyright (c) 2008 Isilon Systems, Inc.
+ * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
+ * Copyright (c) 1998 Berkeley Software Design, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Implementation of the `witness' lock verifier. Originally implemented for
+ * mutexes in BSD/OS. Extended to handle generic lock objects and lock
+ * classes in FreeBSD.
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+/*
+ * Special rules concerning Giant and lock orders:
+ *
+ * 1) Giant must be acquired before any other mutexes. Stated another way,
+ * no other mutex may be held when Giant is acquired.
+ *
+ * 2) Giant must be released when blocking on a sleepable lock.
+ *
+ * This rule is less obvious, but is a result of Giant providing the same
+ * semantics as spl(). Basically, when a thread sleeps, it must release
+ * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule
+ * 2).
+ *
+ * 3) Giant may be acquired before or after sleepable locks.
+ *
+ * This rule is also not quite as obvious. Giant may be acquired after
+ * a sleepable lock because it is a non-sleepable lock and non-sleepable
+ * locks may always be acquired while holding a sleepable lock. The second
+ * case, Giant before a sleepable lock, follows from rule 2) above. Suppose
+ * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1
+ * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and
+ * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to
+ * execute. Thus, acquiring Giant both before and after a sleepable lock
+ * will not result in a lock order reversal.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_stack.h"
+#include "opt_witness.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#if !defined(DDB) && !defined(STACK)
+#error "DDB or STACK options are required for WITNESS"
+#endif
+
+/* Note that these traces do not work with KTR_ALQ. */
+#if 0
+#define KTR_WITNESS KTR_SUBSYS
+#else
+#define KTR_WITNESS 0
+#endif
+
+#define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */
+#define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */
+#define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */
+
+/* Define this to check for blessed mutexes */
+#undef BLESSING
+
+#define WITNESS_COUNT 1024
+#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4)
+#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
+#define WITNESS_PENDLIST 1024
+
+/* Allocate 256 KB of stack data space */
+#define WITNESS_LO_DATA_COUNT 2048
+
+/* Prime, gives load factor of ~2 at full load */
+#define WITNESS_LO_HASH_SIZE 1021
+
+/*
+ * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
+ * will hold LOCK_NCHILDREN locks. We handle failure gracefully, and we
+ * should probably be safe for the most part, but it's still a SWAG.
+ */
+#define LOCK_NCHILDREN 5
+#define LOCK_CHILDCOUNT 2048
+
+#define MAX_W_NAME 64
+
+#define BADSTACK_SBUF_SIZE (256 * WITNESS_COUNT)
+#define FULLGRAPH_SBUF_SIZE 512
+
+/*
+ * These flags go in the witness relationship matrix and describe the
+ * relationship between any two struct witness objects.
+ */
+#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
+#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
+#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
+#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
+#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
+#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
+#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
+#define WITNESS_RELATED_MASK \
+ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
+#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
+ * observed. */
+#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
+#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
+#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
+
+/* Descendant to ancestor flags */
+#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
+
+/* Ancestor to descendant flags */
+#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2)
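+/* A shift by 2 converts between the PARENT/ANCESTOR and CHILD/DESCENDANT bits. */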
+
+#define WITNESS_INDEX_ASSERT(i) \
+ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < WITNESS_COUNT)
+
+static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
+
+/*
+ * Lock instances. A lock instance is the data associated with a lock while
+ * it is held by witness. For example, a lock instance will hold the
+ * recursion count of a lock. Lock instances are held in lists. Spin locks
+ * are held in a per-cpu list while sleep locks are held in a per-thread list.
+ */
+struct lock_instance {
+ struct lock_object *li_lock;
+ const char *li_file;
+ int li_line;
+ u_int li_flags;
+};
+
+/*
+ * A simple list type used to build the list of locks held by a thread
+ * or CPU. We can't simply embed the list in struct lock_object since a
+ * lock may be held by more than one thread if it is a shared lock. Locks
+ * are added to the head of the list, so we fill up each list entry from
+ * "the back" logically. To ease some of the arithmetic, we actually fill
+ * in each list entry the normal way (children[0] then children[1], etc.) but
+ * when we traverse the list we read children[count-1] as the first entry
+ * down to children[0] as the final entry.
+ */
+struct lock_list_entry {
+ struct lock_list_entry *ll_next;
+ struct lock_instance ll_children[LOCK_NCHILDREN];
+ u_int ll_count;
+};
+
+/*
+ * The main witness structure. One of these per named lock type in the system
+ * (for example, "vnode interlock").
+ */
+struct witness {
+ char w_name[MAX_W_NAME];
+ uint32_t w_index; /* Index in the relationship matrix */
+ struct lock_class *w_class;
+ STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
+ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
+ struct witness *w_hash_next; /* Linked list in hash buckets. */
+ const char *w_file; /* File where last acquired */
+ uint32_t w_line; /* Line where last acquired */
+ uint32_t w_refcount;
+ uint16_t w_num_ancestors; /* direct/indirect
+ * ancestor count */
+ uint16_t w_num_descendants; /* direct/indirect
+ * descendant count */
+ int16_t w_ddb_level;
+ unsigned w_displayed:1;
+ unsigned w_reversed:1;
+};
+
+STAILQ_HEAD(witness_list, witness);
+
+/*
+ * The witness hash table. Keys are witness names (const char *), elements are
+ * witness objects (struct witness *).
+ */
+struct witness_hash {
+ struct witness *wh_array[WITNESS_HASH_SIZE];
+ uint32_t wh_size;
+ uint32_t wh_count;
+};
+
+/*
+ * Key type for the lock order data hash table.
+ */
+struct witness_lock_order_key {
+ uint16_t from;
+ uint16_t to;
+};
+
+struct witness_lock_order_data {
+ struct stack wlod_stack;
+ struct witness_lock_order_key wlod_key;
+ struct witness_lock_order_data *wlod_next;
+};
+
+/*
+ * The witness lock order data hash table. Keys are witness index tuples
+ * (struct witness_lock_order_key), elements are lock order data objects
+ * (struct witness_lock_order_data).
+ */
+struct witness_lock_order_hash {
+ struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
+ u_int wloh_size;
+ u_int wloh_count;
+};
+
+#ifdef BLESSING
+struct witness_blessed {
+ const char *b_lock1;
+ const char *b_lock2;
+};
+#endif
+
+struct witness_pendhelp {
+ const char *wh_type;
+ struct lock_object *wh_lock;
+};
+
+struct witness_order_list_entry {
+ const char *w_name;
+ struct lock_class *w_class;
+};
+
+/*
+ * Returns 0 if one of the locks is a spin lock and the other is not.
+ * Returns 1 otherwise.
+ */
+static __inline int
+witness_lock_type_equal(struct witness *w1, struct witness *w2)
+{
+
+ return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
+ (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
+}
+
+static __inline int
+witness_lock_order_key_empty(const struct witness_lock_order_key *key)
+{
+
+ return (key->from == 0 && key->to == 0);
+}
+
+static __inline int
+witness_lock_order_key_equal(const struct witness_lock_order_key *a,
+ const struct witness_lock_order_key *b)
+{
+
+ return (a->from == b->from && a->to == b->to);
+}
+
+static int _isitmyx(struct witness *w1, struct witness *w2, int rmask,
+ const char *fname);
+#ifdef KDB
+static void _witness_debugger(int cond, const char *msg);
+#endif
+static void adopt(struct witness *parent, struct witness *child);
+#ifdef BLESSING
+static int blessed(struct witness *, struct witness *);
+#endif
+static void depart(struct witness *w);
+static struct witness *enroll(const char *description,
+ struct lock_class *lock_class);
+static struct lock_instance *find_instance(struct lock_list_entry *list,
+ const struct lock_object *lock);
+static int isitmychild(struct witness *parent, struct witness *child);
+static int isitmydescendant(struct witness *parent, struct witness *child);
+static void itismychild(struct witness *parent, struct witness *child);
+static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
+static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
+static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
+static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
+#ifdef DDB
+static void witness_ddb_compute_levels(void);
+static void witness_ddb_display(int(*)(const char *fmt, ...));
+static void witness_ddb_display_descendants(int(*)(const char *fmt, ...),
+ struct witness *, int indent);
+static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
+ struct witness_list *list);
+static void witness_ddb_level_descendants(struct witness *parent, int l);
+static void witness_ddb_list(struct thread *td);
+#endif
+static void witness_free(struct witness *m);
+static struct witness *witness_get(void);
+static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size);
+static struct witness *witness_hash_get(const char *key);
+static void witness_hash_put(struct witness *w);
+static void witness_init_hash_tables(void);
+static void witness_increment_graph_generation(void);
+static void witness_lock_list_free(struct lock_list_entry *lle);
+static struct lock_list_entry *witness_lock_list_get(void);
+static int witness_lock_order_add(struct witness *parent,
+ struct witness *child);
+static int witness_lock_order_check(struct witness *parent,
+ struct witness *child);
+static struct witness_lock_order_data *witness_lock_order_get(
+ struct witness *parent,
+ struct witness *child);
+static void witness_list_lock(struct lock_instance *instance,
+ int (*prnt)(const char *fmt, ...));
+static void witness_setflag(struct lock_object *lock, int flag, int set);
+
+#ifdef KDB
+#define witness_debugger(c) _witness_debugger(c, __func__)
+#else
+#define witness_debugger(c)
+#endif
+
+static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
+ "Witness Locking");
+
+/*
+ * If set to 0, lock order checking is disabled. If set to -1,
+ * witness is completely disabled. Otherwise witness performs full
+ * lock order checking for all locks. At runtime, lock order checking
+ * may be toggled. However, witness cannot be reenabled once it is
+ * completely disabled.
+ */
+static int witness_watch = 1;
+TUNABLE_INT("debug.witness.watch", &witness_watch);
+SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RW | CTLTYPE_INT, NULL, 0,
+ sysctl_debug_witness_watch, "I", "witness is watching lock operations");
+
+#ifdef KDB
+/*
+ * When KDB is enabled and witness_kdb is 1, it will cause the system
+ * to drop into kdebug() when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifdef WITNESS_KDB
+int witness_kdb = 1;
+#else
+int witness_kdb = 0;
+#endif
+TUNABLE_INT("debug.witness.kdb", &witness_kdb);
+SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RW, &witness_kdb, 0, "");
+
+/*
+ * When KDB is enabled and witness_trace is 1, it will cause the system
+ * to print a stack trace when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+int witness_trace = 1;
+TUNABLE_INT("debug.witness.trace", &witness_trace);
+SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RW, &witness_trace, 0, "");
+#endif /* KDB */
+
+#ifdef WITNESS_SKIPSPIN
+int witness_skipspin = 1;
+#else
+int witness_skipspin = 0;
+#endif
+TUNABLE_INT("debug.witness.skipspin", &witness_skipspin);
+SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin,
+ 0, "");
+
+/*
+ * Call this to print out the relations between locks.
+ */
+SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
+
+/*
+ * Call this to print out the witness faulty stacks.
+ */
+SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
+
+static struct mtx w_mtx;
+
+/* w_list */
+static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
+static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
+
+/* w_typelist */
+static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
+static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
+
+/* lock list */
+static struct lock_list_entry *w_lock_list_free = NULL;
+static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
+static u_int pending_cnt;
+
+static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
+SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
+SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
+SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
+ "");
+
+static struct witness *w_data;
+static uint8_t w_rmatrix[WITNESS_COUNT+1][WITNESS_COUNT+1];
+static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
+static struct witness_hash w_hash; /* The witness hash table. */
+
+/* The lock order data hash */
+static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
+static struct witness_lock_order_data *w_lofree = NULL;
+static struct witness_lock_order_hash w_lohash;
+static int w_max_used_index = 0;
+static unsigned int w_generation = 0;
+static const char w_notrunning[] = "Witness not running\n";
+static const char w_stillcold[] = "Witness is still cold\n";
+
+static struct witness_order_list_entry order_lists[] = {
+ /*
+ * sx locks
+ */
+ { "proctree", &lock_class_sx },
+ { "allproc", &lock_class_sx },
+ { "allprison", &lock_class_sx },
+ { NULL, NULL },
+ /*
+ * Various mutexes
+ */
+ { "Giant", &lock_class_mtx_sleep },
+ { "pipe mutex", &lock_class_mtx_sleep },
+ { "sigio lock", &lock_class_mtx_sleep },
+ { "process group", &lock_class_mtx_sleep },
+ { "process lock", &lock_class_mtx_sleep },
+ { "session", &lock_class_mtx_sleep },
+ { "uidinfo hash", &lock_class_rw },
+#ifdef HWPMC_HOOKS
+ { "pmc-sleep", &lock_class_mtx_sleep },
+#endif
+ { "time lock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * Sockets
+ */
+ { "accept", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { "so_rcv", &lock_class_mtx_sleep },
+ { "sellck", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * Routing
+ */
+ { "so_rcv", &lock_class_mtx_sleep },
+ { "radix node head", &lock_class_rw },
+ { "rtentry", &lock_class_mtx_sleep },
+ { "ifaddr", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * IPv4 multicast:
+ * protocol locks before interface locks, after UDP locks.
+ */
+ { "udpinp", &lock_class_rw },
+ { "in_multi_mtx", &lock_class_mtx_sleep },
+ { "igmp_mtx", &lock_class_mtx_sleep },
+ { "if_addr_lock", &lock_class_rw },
+ { NULL, NULL },
+ /*
+ * IPv6 multicast:
+ * protocol locks before interface locks, after UDP locks.
+ */
+ { "udpinp", &lock_class_rw },
+ { "in6_multi_mtx", &lock_class_mtx_sleep },
+ { "mld_mtx", &lock_class_mtx_sleep },
+ { "if_addr_lock", &lock_class_rw },
+ { NULL, NULL },
+ /*
+ * UNIX Domain Sockets
+ */
+ { "unp_global_rwlock", &lock_class_rw },
+ { "unp_list_lock", &lock_class_mtx_sleep },
+ { "unp", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * UDP/IP
+ */
+ { "udp", &lock_class_rw },
+ { "udpinp", &lock_class_rw },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * TCP/IP
+ */
+ { "tcp", &lock_class_rw },
+ { "tcpinp", &lock_class_rw },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * netatalk
+ */
+ { "ddp_list_mtx", &lock_class_mtx_sleep },
+ { "ddp_mtx", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * BPF
+ */
+ { "bpf global lock", &lock_class_mtx_sleep },
+ { "bpf interface lock", &lock_class_rw },
+ { "bpf cdev lock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * NFS server
+ */
+ { "nfsd_mtx", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+
+ /*
+ * IEEE 802.11
+ */
+ { "802.11 com lock", &lock_class_mtx_sleep},
+ { NULL, NULL },
+ /*
+ * Network drivers
+ */
+ { "network driver", &lock_class_mtx_sleep},
+ { NULL, NULL },
+
+ /*
+ * Netgraph
+ */
+ { "ng_node", &lock_class_mtx_sleep },
+ { "ng_worklist", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * CDEV
+ */
+ { "vm map (system)", &lock_class_mtx_sleep },
+ { "vm page queue", &lock_class_mtx_sleep },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { "cdev", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * VM
+ */
+ { "vm map (user)", &lock_class_sx },
+ { "vm object", &lock_class_rw },
+ { "vm page", &lock_class_mtx_sleep },
+ { "vm page queue", &lock_class_mtx_sleep },
+ { "pmap pv global", &lock_class_rw },
+ { "pmap", &lock_class_mtx_sleep },
+ { "pmap pv list", &lock_class_rw },
+ { "vm page free queue", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * kqueue/VFS interaction
+ */
+ { "kqueue", &lock_class_mtx_sleep },
+ { "struct mount mtx", &lock_class_mtx_sleep },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * ZFS locking
+ */
+ { "dn->dn_mtx", &lock_class_sx },
+ { "dr->dt.di.dr_mtx", &lock_class_sx },
+ { "db->db_mtx", &lock_class_sx },
+ { NULL, NULL },
+ /*
+ * spin locks
+ */
+#ifdef SMP
+ { "ap boot", &lock_class_mtx_spin },
+#endif
+ { "rm.mutex_mtx", &lock_class_mtx_spin },
+ { "sio", &lock_class_mtx_spin },
+ { "scrlock", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "cy", &lock_class_mtx_spin },
+#endif
+#ifdef __sparc64__
+ { "pcib_mtx", &lock_class_mtx_spin },
+ { "rtc_mtx", &lock_class_mtx_spin },
+#endif
+ { "scc_hwmtx", &lock_class_mtx_spin },
+ { "uart_hwmtx", &lock_class_mtx_spin },
+ { "fast_taskqueue", &lock_class_mtx_spin },
+ { "intr table", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-per-proc", &lock_class_mtx_spin },
+#endif
+ { "process slock", &lock_class_mtx_spin },
+ { "sleepq chain", &lock_class_mtx_spin },
+ { "umtx lock", &lock_class_mtx_spin },
+ { "rm_spinlock", &lock_class_mtx_spin },
+ { "turnstile chain", &lock_class_mtx_spin },
+ { "turnstile lock", &lock_class_mtx_spin },
+ { "sched lock", &lock_class_mtx_spin },
+ { "td_contested", &lock_class_mtx_spin },
+ { "callout", &lock_class_mtx_spin },
+ { "entropy harvest mutex", &lock_class_mtx_spin },
+ { "syscons video lock", &lock_class_mtx_spin },
+#ifdef SMP
+ { "smp rendezvous", &lock_class_mtx_spin },
+#endif
+#ifdef __powerpc__
+ { "tlb0", &lock_class_mtx_spin },
+#endif
+ /*
+ * leaf locks
+ */
+ { "intrcnt", &lock_class_mtx_spin },
+ { "icu", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "allpmaps", &lock_class_mtx_spin },
+ { "descriptor tables", &lock_class_mtx_spin },
+#endif
+ { "clk", &lock_class_mtx_spin },
+ { "cpuset", &lock_class_mtx_spin },
+ { "mprof lock", &lock_class_mtx_spin },
+ { "zombie lock", &lock_class_mtx_spin },
+ { "ALD Queue", &lock_class_mtx_spin },
+#ifdef __ia64__
+ { "MCA spin lock", &lock_class_mtx_spin },
+#endif
+#if defined(__i386__) || defined(__amd64__)
+ { "pcicfg", &lock_class_mtx_spin },
+ { "NDIS thread lock", &lock_class_mtx_spin },
+#endif
+ { "tw_osl_io_lock", &lock_class_mtx_spin },
+ { "tw_osl_q_lock", &lock_class_mtx_spin },
+ { "tw_cl_io_lock", &lock_class_mtx_spin },
+ { "tw_cl_intr_lock", &lock_class_mtx_spin },
+ { "tw_cl_gen_lock", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-leaf", &lock_class_mtx_spin },
+#endif
+ { "blocked lock", &lock_class_mtx_spin },
+ { NULL, NULL },
+ { NULL, NULL }
+};
+
+#ifdef BLESSING
+/*
+ * Pairs of locks which have been blessed
+ * Don't complain about order problems with blessed locks
+ */
+static struct witness_blessed blessed_list[] = {
+};
+static int blessed_count =
+ sizeof(blessed_list) / sizeof(struct witness_blessed);
+#endif
+
+/*
+ * This global is set to 0 once it becomes safe to use the witness code.
+ */
+static int witness_cold = 1;
+
+/*
+ * This global is set to 1 once the static lock orders have been enrolled
+ * so that a warning can be issued for any spin locks enrolled later.
+ */
+static int witness_spin_warn = 0;
+
+/* Trim useless garbage from filenames. */
+static const char *
+fixup_filename(const char *file)
+{
+
+ if (file == NULL)
+ return (NULL);
+ while (strncmp(file, "../", 3) == 0)
+ file += 3;
+ return (file);
+}
+
+/*
+ * The WITNESS-enabled diagnostic code. Note that the witness code does
+ * assume that the early boot is single-threaded at least until after this
+ * routine is completed.
+ */
+static void
+witness_initialize(void *dummy __unused)
+{
+ struct lock_object *lock;
+ struct witness_order_list_entry *order;
+ struct witness *w, *w1;
+ int i;
+
+ w_data = malloc(sizeof (struct witness) * WITNESS_COUNT, M_WITNESS,
+ M_NOWAIT | M_ZERO);
+
+ /*
+ * We have to release Giant before initializing its witness
+ * structure so that WITNESS doesn't get confused.
+ */
+ mtx_unlock(&Giant);
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
+ mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
+ MTX_NOWITNESS | MTX_NOPROFILE);
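+ /*
+ * Set up the witness records in reverse index order so that the
+ * entry with index 0 ends up at the head of the free list, as the
+ * KASSERT below expects.
+ */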
+ for (i = WITNESS_COUNT - 1; i >= 0; i--) {
+ w = &w_data[i];
+ memset(w, 0, sizeof(*w));
+ w_data[i].w_index = i; /* Witness index never changes. */
+ witness_free(w);
+ }
+ KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
+ ("%s: Invalid list of free witness objects", __func__));
+
+ /* Witness with index 0 is deliberately left unused, to aid in debugging. */
+ STAILQ_REMOVE_HEAD(&w_free, w_list);
+ w_free_cnt--;
+
+ memset(w_rmatrix, 0,
+ (sizeof(**w_rmatrix) * (WITNESS_COUNT+1) * (WITNESS_COUNT+1)));
+
+ for (i = 0; i < LOCK_CHILDCOUNT; i++)
+ witness_lock_list_free(&w_locklistdata[i]);
+ witness_init_hash_tables();
+
+ /* First add in all the specified order lists. */
+ for (order = order_lists; order->w_name != NULL; order++) {
+ w = enroll(order->w_name, order->w_class);
+ if (w == NULL)
+ continue;
+ w->w_file = "order list";
+ for (order++; order->w_name != NULL; order++) {
+ w1 = enroll(order->w_name, order->w_class);
+ if (w1 == NULL)
+ continue;
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+ witness_spin_warn = 1;
+
+ /* Iterate through all locks and add them to witness. */
+ for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
+ lock = pending_locks[i].wh_lock;
+ KASSERT(lock->lo_flags & LO_WITNESS,
+ ("%s: lock %s is on pending list but not LO_WITNESS",
+ __func__, lock->lo_name));
+ lock->lo_witness = enroll(pending_locks[i].wh_type,
+ LOCK_CLASS(lock));
+ }
+
+ /* Mark the witness code as being ready for use. */
+ witness_cold = 0;
+
+ mtx_lock(&Giant);
+}
+SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
+ NULL);
+
+void
+witness_init(struct lock_object *lock, const char *type)
+{
+ struct lock_class *class;
+
+ /* Various sanity checks. */
+ class = LOCK_CLASS(lock);
+ if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
+ (class->lc_flags & LC_RECURSABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be recursable",
+ __func__, class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (class->lc_flags & LC_SLEEPABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be sleepable",
+ __func__, class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
+ (class->lc_flags & LC_UPGRADABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be upgradable",
+ __func__, class->lc_name, lock->lo_name);
+
+ /*
+ * If we shouldn't watch this lock, then just clear lo_witness.
+ * Otherwise, if witness_cold is set, then it is too early to
+ * enroll this lock, so defer it to witness_initialize() by adding
+ * it to the pending_locks list. If it is not too early, then enroll
+ * the lock now.
+ */
+ if (witness_watch < 1 || panicstr != NULL ||
+ (lock->lo_flags & LO_WITNESS) == 0)
+ lock->lo_witness = NULL;
+ else if (witness_cold) {
+ pending_locks[pending_cnt].wh_lock = lock;
+ pending_locks[pending_cnt++].wh_type = type;
+ if (pending_cnt > WITNESS_PENDLIST)
+ panic("%s: pending locks list is too small, "
+ "increase WITNESS_PENDLIST\n",
+ __func__);
+ } else
+ lock->lo_witness = enroll(type, class);
+}
+
+void
+witness_destroy(struct lock_object *lock)
+{
+ struct lock_class *class;
+ struct witness *w;
+
+ class = LOCK_CLASS(lock);
+
+ if (witness_cold)
+ panic("lock (%s) %s destroyed while witness_cold",
+ class->lc_name, lock->lo_name);
+
+ /* XXX: need to verify that no one holds the lock */
+ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
+ return;
+ w = lock->lo_witness;
+
+ mtx_lock_spin(&w_mtx);
+ MPASS(w->w_refcount > 0);
+ w->w_refcount--;
+
+ if (w->w_refcount == 0)
+ depart(w);
+ mtx_unlock_spin(&w_mtx);
+}
+
+#ifdef DDB
+static void
+witness_ddb_compute_levels(void)
+{
+ struct witness *w;
+
+ /*
+ * First clear all levels.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_ddb_level = -1;
+
+ /*
+ * Look for locks with no parents and level all their descendants.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+
+ /* If the witness has ancestors (is not a root), skip it. */
+ if (w->w_num_ancestors > 0)
+ continue;
+ witness_ddb_level_descendants(w, 0);
+ }
+}
+
+static void
+witness_ddb_level_descendants(struct witness *w, int l)
+{
+ int i;
+
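+ /* A witness keeps the deepest level at which it has been seen. */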
+ if (w->w_ddb_level >= l)
+ return;
+
+ w->w_ddb_level = l;
+ l++;
+
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
+ witness_ddb_level_descendants(&w_data[i], l);
+ }
+}
+
+static void
+witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
+ struct witness *w, int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ prnt(" ");
+ prnt("%s (type: %s, depth: %d, active refs: %d)",
+ w->w_name, w->w_class->lc_name,
+ w->w_ddb_level, w->w_refcount);
+ if (w->w_displayed) {
+ prnt(" -- (already displayed)\n");
+ return;
+ }
+ w->w_displayed = 1;
+ if (w->w_file != NULL && w->w_line != 0)
+ prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
+ w->w_line);
+ else
+ prnt(" -- never acquired\n");
+ indent++;
+ WITNESS_INDEX_ASSERT(w->w_index);
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (db_pager_quit)
+ return;
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
+ witness_ddb_display_descendants(prnt, &w_data[i],
+ indent);
+ }
+}
+
+static void
+witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
+ struct witness_list *list)
+{
+ struct witness *w;
+
+ STAILQ_FOREACH(w, list, w_typelist) {
+ if (w->w_file == NULL || w->w_ddb_level > 0)
+ continue;
+
+ /* This lock has no ancestors - display its descendants. */
+ witness_ddb_display_descendants(prnt, w, 0);
+ if (db_pager_quit)
+ return;
+ }
+}
+
+static void
+witness_ddb_display(int(*prnt)(const char *fmt, ...))
+{
+ struct witness *w;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ witness_ddb_compute_levels();
+
+ /* Clear all the displayed flags. */
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_displayed = 0;
+
+ /*
+ * First, handle sleep locks which have been acquired at least
+ * once.
+ */
+ prnt("Sleep locks:\n");
+ witness_ddb_display_list(prnt, &w_sleep);
+ if (db_pager_quit)
+ return;
+
+ /*
+ * Now do spin locks which have been acquired at least once.
+ */
+ prnt("\nSpin locks:\n");
+ witness_ddb_display_list(prnt, &w_spin);
+ if (db_pager_quit)
+ return;
+
+ /*
+ * Finally, any locks which have not been acquired yet.
+ */
+ prnt("\nLocks which were never acquired:\n");
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ if (w->w_file != NULL || w->w_refcount == 0)
+ continue;
+ prnt("%s (type: %s, depth: %d)\n", w->w_name,
+ w->w_class->lc_name, w->w_ddb_level);
+ if (db_pager_quit)
+ return;
+ }
+}
+#endif /* DDB */
+
+int
+witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
+{
+
+ if (witness_watch == -1 || panicstr != NULL)
+ return (0);
+
+ /* Require locks that witness knows about. */
+ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
+ lock2->lo_witness == NULL)
+ return (EINVAL);
+
+ mtx_assert(&w_mtx, MA_NOTOWNED);
+ mtx_lock_spin(&w_mtx);
+
+ /*
+ * If we already have either an explicit or implied lock order that
+ * is the other way around, then return an error.
+ */
+ if (witness_watch &&
+ isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
+ mtx_unlock_spin(&w_mtx);
+ return (EDOOFUS);
+ }
+
+ /* Try to add the new order. */
+ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
+ lock2->lo_witness->w_name, lock1->lo_witness->w_name);
+ itismychild(lock1->lo_witness, lock2->lo_witness);
+ mtx_unlock_spin(&w_mtx);
+ return (0);
+}
+
+void
+witness_checkorder(struct lock_object *lock, int flags, const char *file,
+ int line, struct lock_object *interlock)
+{
+ struct lock_list_entry *lock_list, *lle;
+ struct lock_instance *lock1, *lock2, *plock;
+ struct lock_class *class, *iclass;
+ struct witness *w, *w1;
+ struct thread *td;
+ int i, j;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+
+ w = lock->lo_witness;
+ class = LOCK_CLASS(lock);
+ td = curthread;
+
+ if (class->lc_flags & LC_SLEEPLOCK) {
+
+ /*
+ * Since spin locks include a critical section, this check
+ * implicitly enforces a lock order of all sleep locks before
+ * all spin locks.
+ */
+ if (td->td_critnest != 0 && !kdb_active)
+ kassert_panic("acquiring blockable sleep lock with "
+ "spinlock or critical section held (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+
+ /*
+ * If this is the first lock acquired then just return as
+ * no order checking is needed.
+ */
+ lock_list = td->td_sleeplocks;
+ if (lock_list == NULL || lock_list->ll_count == 0)
+ return;
+ } else {
+
+ /*
+ * If this is the first lock, just return as no order
+ * checking is needed. Avoid problems with thread
+ * migration pinning the thread while checking if
+ * spinlocks are held. If at least one spinlock is held
+ * the thread is in a safe path and it is allowed to
+ * unpin it.
+ */
+ sched_pin();
+ lock_list = PCPU_GET(spinlocks);
+ if (lock_list == NULL || lock_list->ll_count == 0) {
+ sched_unpin();
+ return;
+ }
+ sched_unpin();
+ }
+
+ /*
+ * Check to see if we are recursing on a lock we already own. If
+ * so, make sure that we don't mismatch exclusive and shared lock
+ * acquires.
+ */
+ lock1 = find_instance(lock_list, lock);
+ if (lock1 != NULL) {
+ if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf("shared lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ printf("while exclusively locked from %s:%d\n",
+ fixup_filename(lock1->li_file), lock1->li_line);
+ kassert_panic("excl->share");
+ }
+ if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf("exclusive lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ printf("while share locked from %s:%d\n",
+ fixup_filename(lock1->li_file), lock1->li_line);
+ kassert_panic("share->excl");
+ }
+ return;
+ }
+
+ /* Warn if the interlock is not locked exactly once. */
+ if (interlock != NULL) {
+ iclass = LOCK_CLASS(interlock);
+ lock1 = find_instance(lock_list, interlock);
+ if (lock1 == NULL)
+ kassert_panic("interlock (%s) %s not locked @ %s:%d",
+ iclass->lc_name, interlock->lo_name,
+ fixup_filename(file), line);
+ else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic("interlock (%s) %s recursed @ %s:%d",
+ iclass->lc_name, interlock->lo_name,
+ fixup_filename(file), line);
+ }
+
+ /*
+ * Find the previously acquired lock, but ignore interlocks.
+ */
+ plock = &lock_list->ll_children[lock_list->ll_count - 1];
+ if (interlock != NULL && plock->li_lock == interlock) {
+ if (lock_list->ll_count > 1)
+ plock =
+ &lock_list->ll_children[lock_list->ll_count - 2];
+ else {
+ lle = lock_list->ll_next;
+
+ /*
+ * The interlock is the only lock we hold, so
+ * simply return.
+ */
+ if (lle == NULL)
+ return;
+ plock = &lle->ll_children[lle->ll_count - 1];
+ }
+ }
+
+ /*
+ * Try to perform most checks without a lock. If this succeeds we
+ * can skip acquiring the lock and return success.
+ */
+ w1 = plock->li_lock->lo_witness;
+ if (witness_lock_order_check(w1, w))
+ return;
+
+ /*
+ * Check for duplicate locks of the same type. Note that we only
+ * have to check for this on the last lock we just acquired. Any
+ * other cases will be caught as lock order violations.
+ */
+ mtx_lock_spin(&w_mtx);
+ witness_lock_order_add(w1, w);
+ if (w1 == w) {
+ i = w->w_index;
+ if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
+ !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
+ w_rmatrix[i][i] |= WITNESS_REVERSAL;
+ w->w_reversed = 1;
+ mtx_unlock_spin(&w_mtx);
+ printf(
+ "acquiring duplicate lock of same type: \"%s\"\n",
+ w->w_name);
+ printf(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
+ fixup_filename(plock->li_file), plock->li_line);
+ printf(" 2nd %s @ %s:%d\n", lock->lo_name,
+ fixup_filename(file), line);
+ witness_debugger(1);
+ } else
+ mtx_unlock_spin(&w_mtx);
+ return;
+ }
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ /*
+ * If we know that the lock we are acquiring comes after
+ * the lock we most recently acquired in the lock order tree,
+ * then there is no need for any further checks.
+ */
+ if (isitmychild(w1, w))
+ goto out;
+
+ for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
+ for (i = lle->ll_count - 1; i >= 0; i--, j++) {
+
+ MPASS(j < WITNESS_COUNT);
+ lock1 = &lle->ll_children[i];
+
+ /*
+ * Ignore the interlock.
+ */
+ if (interlock == lock1->li_lock)
+ continue;
+
+ /*
+ * If this lock doesn't undergo witness checking,
+ * then skip it.
+ */
+ w1 = lock1->li_lock->lo_witness;
+ if (w1 == NULL) {
+ KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
+ ("lock missing witness structure"));
+ continue;
+ }
+
+ /*
+ * If we are locking Giant and this is a sleepable
+ * lock, then skip it.
+ */
+ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ lock == &Giant.lock_object)
+ continue;
+
+ /*
+ * If we are locking a sleepable lock and this lock
+ * is Giant, then skip it.
+ */
+ if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ lock1->li_lock == &Giant.lock_object)
+ continue;
+
+ /*
+ * If we are locking a sleepable lock and this lock
+ * isn't sleepable, we want to treat it as a lock
+ * order violation to enforce a general lock order of
+ * sleepable locks before non-sleepable locks.
+ */
+ if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
+ goto reversal;
+
+ /*
+ * If we are locking Giant and this is a non-sleepable
+ * lock, then treat it as a reversal.
+ */
+ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
+ lock == &Giant.lock_object)
+ goto reversal;
+
+ /*
+ * Check the lock order hierarchy for a reversal.
+ */
+ if (!isitmydescendant(w, w1))
+ continue;
+ reversal:
+
+ /*
+ * We have a lock order violation, check to see if it
+ * is allowed or has already been yelled about.
+ */
+#ifdef BLESSING
+
+ /*
+ * If the lock order is blessed, just bail. We don't
+ * look for other lock order violations though, which
+ * may be a bug.
+ */
+ if (blessed(w, w1))
+ goto out;
+#endif
+
+ /* Bail if this violation is known */
+ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
+ goto out;
+
+ /* Record this as a violation */
+ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
+ w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
+ w->w_reversed = w1->w_reversed = 1;
+ witness_increment_graph_generation();
+ mtx_unlock_spin(&w_mtx);
+
+#ifdef WITNESS_NO_VNODE
+ /*
+ * There are known LORs between VNODE locks. They are
+ * not an indication of a bug. VNODE locks are flagged
+ * as such (LO_IS_VNODE) and we don't yell if the LOR
+ * is between 2 VNODE locks.
+ */
+ if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
+ return;
+#endif
+
+ /*
+ * Ok, yell about it.
+ */
+ if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
+ printf(
+ "lock order reversal: (sleepable after non-sleepable)\n");
+ else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
+ && lock == &Giant.lock_object)
+ printf(
+ "lock order reversal: (Giant after non-sleepable)\n");
+ else
+ printf("lock order reversal:\n");
+
+ /*
+ * Try to locate an earlier lock with
+ * witness w in our list.
+ */
+ do {
+ lock2 = &lle->ll_children[i];
+ MPASS(lock2->li_lock != NULL);
+ if (lock2->li_lock->lo_witness == w)
+ break;
+ if (i == 0 && lle->ll_next != NULL) {
+ lle = lle->ll_next;
+ i = lle->ll_count - 1;
+ MPASS(i >= 0 && i < LOCK_NCHILDREN);
+ } else
+ i--;
+ } while (i >= 0);
+ if (i < 0) {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ w1->w_name, fixup_filename(lock1->li_file),
+ lock1->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, w->w_name,
+ fixup_filename(file), line);
+ } else {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock2->li_lock, lock2->li_lock->lo_name,
+ lock2->li_lock->lo_witness->w_name,
+ fixup_filename(lock2->li_file),
+ lock2->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ w1->w_name, fixup_filename(lock1->li_file),
+ lock1->li_line);
+ printf(" 3rd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, w->w_name,
+ fixup_filename(file), line);
+ }
+ witness_debugger(1);
+ return;
+ }
+ }
+
+ /*
+ * If requested, build a new lock order. However, don't build a new
+ * relationship between a sleepable lock and Giant if it is in the
+ * wrong direction. The correct lock order is that sleepable locks
+ * always come before Giant.
+ */
+ if (flags & LOP_NEWORDER &&
+ !(plock->li_lock == &Giant.lock_object &&
+ (lock->lo_flags & LO_SLEEPABLE) != 0)) {
+ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
+ w->w_name, plock->li_lock->lo_witness->w_name);
+ itismychild(plock->li_lock->lo_witness, w);
+ }
+out:
+ mtx_unlock_spin(&w_mtx);
+}
+
+void
+witness_lock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *instance;
+ struct witness *w;
+ struct thread *td;
+
+ if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+ w = lock->lo_witness;
+ td = curthread;
+
+ /* Determine lock list for this lock. */
+ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
+ lock_list = &td->td_sleeplocks;
+ else
+ lock_list = PCPU_PTR(spinlocks);
+
+ /* Check to see if we are recursing on a lock we already own. */
+ instance = find_instance(*lock_list, lock);
+ if (instance != NULL) {
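+ /* The low bits of li_flags (LI_RECURSEMASK) hold the recursion count. */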
+ instance->li_flags++;
+ CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
+ td->td_proc->p_pid, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK);
+ instance->li_file = file;
+ instance->li_line = line;
+ return;
+ }
+
+ /* Update per-witness last file and line acquire. */
+ w->w_file = file;
+ w->w_line = line;
+
+ /* Find the next open lock instance in the list and fill it. */
+ lle = *lock_list;
+ if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
+ lle = witness_lock_list_get();
+ if (lle == NULL)
+ return;
+ lle->ll_next = *lock_list;
+ CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ *lock_list = lle;
+ }
+ instance = &lle->ll_children[lle->ll_count++];
+ instance->li_lock = lock;
+ instance->li_line = line;
+ instance->li_file = file;
+ if ((flags & LOP_EXCLUSIVE) != 0)
+ instance->li_flags = LI_EXCLUSIVE;
+ else
+ instance->li_flags = 0;
+ CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
+ td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
+}
+
+void
+witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (witness_watch) {
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ kassert_panic(
+ "upgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
+ kassert_panic(
+ "upgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ }
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL) {
+ kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ return;
+ }
+ if (witness_watch) {
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0)
+ kassert_panic(
+ "upgrade of exclusive lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic(
+ "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK,
+ fixup_filename(file), line);
+ }
+ instance->li_flags |= LI_EXCLUSIVE;
+}
+
+void
+witness_downgrade(struct lock_object *lock, int flags, const char *file,
+ int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (witness_watch) {
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ kassert_panic(
+ "downgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
+ kassert_panic(
+ "downgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ }
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL) {
+ kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ return;
+ }
+ if (witness_watch) {
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0)
+ kassert_panic(
+ "downgrade of shared lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic(
+ "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK,
+ fixup_filename(file), line);
+ }
+ instance->li_flags &= ~LI_EXCLUSIVE;
+}
+
+void
+witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *instance;
+ struct lock_class *class;
+ struct thread *td;
+ register_t s;
+ int i, j;
+
+ if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
+ return;
+ td = curthread;
+ class = LOCK_CLASS(lock);
+
+ /* Find lock instance associated with this lock. */
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = &td->td_sleeplocks;
+ else
+ lock_list = PCPU_PTR(spinlocks);
+ lle = *lock_list;
+ for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
+ for (i = 0; i < (*lock_list)->ll_count; i++) {
+ instance = &(*lock_list)->ll_children[i];
+ if (instance->li_lock == lock)
+ goto found;
+ }
+
+ /*
+ * When disabling WITNESS through witness_watch we could end up with
+ * registered locks still present in the td_sleeplocks queue.
+ * We have to make sure those queues are flushed, so just search for
+ * any such leftover registered locks and remove them.
+ */
+ if (witness_watch > 0) {
+ kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ return;
+ } else {
+ return;
+ }
+found:
+
+ /* First, check for shared/exclusive mismatches. */
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf("shared unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ printf("while exclusively locked from %s:%d\n",
+ fixup_filename(instance->li_file), instance->li_line);
+ kassert_panic("excl->ushare");
+ }
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ printf("while share locked from %s:%d\n",
+ fixup_filename(instance->li_file),
+ instance->li_line);
+ kassert_panic("share->uexcl");
+ }
+ /* If we are recursed, unrecurse. */
+ if ((instance->li_flags & LI_RECURSEMASK) > 0) {
+ CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
+ td->td_proc->p_pid, instance->li_lock->lo_name,
+ instance->li_flags);
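+ /* The recursion count lives in the low bits of li_flags. */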
+ instance->li_flags--;
+ return;
+ }
+ /* The lock is now being dropped; check for the NORELEASE flag. */
+ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
+ printf("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ kassert_panic("lock marked norelease");
+ }
+
+ /* Otherwise, remove this item from the list. */
+ s = intr_disable();
+ CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
+ td->td_proc->p_pid, instance->li_lock->lo_name,
+ (*lock_list)->ll_count - 1);
+ for (j = i; j < (*lock_list)->ll_count - 1; j++)
+ (*lock_list)->ll_children[j] =
+ (*lock_list)->ll_children[j + 1];
+ (*lock_list)->ll_count--;
+ intr_restore(s);
+
+ /*
+ * To reduce contention on w_mtx, we always try to keep a head object
+ * on each list so that frequent allocation from the free witness
+ * pool (and the locking that goes with it) is avoided.  To keep the
+ * code simple, an emptied entry is unlinked and returned to the free
+ * pool unless it is the last entry on the list, in which case it is
+ * retained as the head.
+ */
+ if ((*lock_list)->ll_count == 0) {
+ if (*lock_list == lle) {
+ if (lle->ll_next == NULL)
+ return;
+ } else
+ lle = *lock_list;
+ *lock_list = lle->ll_next;
+ CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ witness_lock_list_free(lle);
+ }
+}
+
+void
+witness_thread_exit(struct thread *td)
+{
+ struct lock_list_entry *lle;
+ int i, n;
+
+ lle = td->td_sleeplocks;
+ if (lle == NULL || panicstr != NULL)
+ return;
+ if (lle->ll_count != 0) {
+ for (n = 0; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ if (n == 0)
+ printf("Thread %p exiting with the following locks held:\n",
+ td);
+ n++;
+ witness_list_lock(&lle->ll_children[i], printf);
+
+ }
+ kassert_panic(
+ "Thread %p cannot exit while holding sleeplocks\n", td);
+ }
+ witness_lock_list_free(lle);
+}
+
+/*
+ * Warn if any locks other than 'lock' are held. Flags can be passed in to
+ * exempt Giant and sleepable locks from the checks as well. If any
+ * non-exempt locks are held, then a supplied message is printed to the
+ * console along with a list of the offending locks. If indicated in the
+ * flags then a failure results in a panic as well.
+ */
+int
+witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
+{
+ struct lock_list_entry *lock_list, *lle;
+ struct lock_instance *lock1;
+ struct thread *td;
+ va_list ap;
+ int i, n;
+
+ if (witness_cold || witness_watch < 1 || panicstr != NULL)
+ return (0);
+ n = 0;
+ td = curthread;
+ for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ lock1 = &lle->ll_children[i];
+ if (lock1->li_lock == lock)
+ continue;
+ if (flags & WARN_GIANTOK &&
+ lock1->li_lock == &Giant.lock_object)
+ continue;
+ if (flags & WARN_SLEEPOK &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
+ continue;
+ if (n == 0) {
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf(" with the following");
+ if (flags & WARN_SLEEPOK)
+ printf(" non-sleepable");
+ printf(" locks held:\n");
+ }
+ n++;
+ witness_list_lock(lock1, printf);
+ }
+
+ /*
+ * Pin the thread to avoid problems with migration between CPUs.
+ * Once all of the checks on spin lock ownership have passed, the
+ * thread is on a safe path and can be unpinned.
+ */
+ sched_pin();
+ lock_list = PCPU_GET(spinlocks);
+ if (lock_list != NULL && lock_list->ll_count != 0) {
+ sched_unpin();
+
+ /*
+ * We should only have one spin lock and, since the exemption
+ * flags cannot apply to this lock class, just check whether
+ * the single spin lock held is the one curthread is allowed
+ * to hold.
+ */
+ lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
+ if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
+ lock1->li_lock == lock && n == 0)
+ return (0);
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf(" with the following");
+ if (flags & WARN_SLEEPOK)
+ printf(" non-sleepable");
+ printf(" locks held:\n");
+ n += witness_list_locks(&lock_list, printf);
+ } else
+ sched_unpin();
+ if (flags & WARN_PANIC && n)
+ kassert_panic("%s", __func__);
+ else
+ witness_debugger(n);
+ return (n);
+}
+
+const char *
+witness_file(struct lock_object *lock)
+{
+ struct witness *w;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
+ return ("?");
+ w = lock->lo_witness;
+ return (w->w_file);
+}
+
+int
+witness_line(struct lock_object *lock)
+{
+ struct witness *w;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
+ return (0);
+ w = lock->lo_witness;
+ return (w->w_line);
+}
+
+static struct witness *
+enroll(const char *description, struct lock_class *lock_class)
+{
+ struct witness *w;
+ struct witness_list *typelist;
+
+ MPASS(description != NULL);
+
+ if (witness_watch == -1 || panicstr != NULL)
+ return (NULL);
+ if ((lock_class->lc_flags & LC_SPINLOCK)) {
+ if (witness_skipspin)
+ return (NULL);
+ else
+ typelist = &w_spin;
+ } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
+ typelist = &w_sleep;
+ } else {
+ kassert_panic("lock class %s is not sleep or spin",
+ lock_class->lc_name);
+ return (NULL);
+ }
+
+ mtx_lock_spin(&w_mtx);
+ w = witness_hash_get(description);
+ if (w)
+ goto found;
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ MPASS(strlen(description) < MAX_W_NAME);
+ strcpy(w->w_name, description);
+ w->w_class = lock_class;
+ w->w_refcount = 1;
+ STAILQ_INSERT_HEAD(&w_all, w, w_list);
+ if (lock_class->lc_flags & LC_SPINLOCK) {
+ STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
+ w_spin_cnt++;
+ } else if (lock_class->lc_flags & LC_SLEEPLOCK) {
+ STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
+ w_sleep_cnt++;
+ }
+
+ /* Insert new witness into the hash */
+ witness_hash_put(w);
+ witness_increment_graph_generation();
+ mtx_unlock_spin(&w_mtx);
+ return (w);
+found:
+ w->w_refcount++;
+ mtx_unlock_spin(&w_mtx);
+ if (lock_class != w->w_class)
+ kassert_panic(
+ "lock (%s) %s does not match earlier (%s) lock",
+ description, lock_class->lc_name,
+ w->w_class->lc_name);
+ return (w);
+}
+
+static void
+depart(struct witness *w)
+{
+ struct witness_list *list;
+
+ MPASS(w->w_refcount == 0);
+ if (w->w_class->lc_flags & LC_SLEEPLOCK) {
+ list = &w_sleep;
+ w_sleep_cnt--;
+ } else {
+ list = &w_spin;
+ w_spin_cnt--;
+ }
+ /*
+ * Set file to NULL as it may point into a loadable module.
+ */
+ w->w_file = NULL;
+ w->w_line = 0;
+ witness_increment_graph_generation();
+}
+
+
+static void
+adopt(struct witness *parent, struct witness *child)
+{
+ int pi, ci, i, j;
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ /* If the relationship is already known, there's no work to be done. */
+ if (isitmychild(parent, child))
+ return;
+
+ /* When the structure of the graph changes, bump up the generation. */
+ witness_increment_graph_generation();
+
+ /*
+ * The hard part ... create the direct relationship, then propagate all
+ * indirect relationships.
+ */
+ pi = parent->w_index;
+ ci = child->w_index;
+ WITNESS_INDEX_ASSERT(pi);
+ WITNESS_INDEX_ASSERT(ci);
+ MPASS(pi != ci);
+ w_rmatrix[pi][ci] |= WITNESS_PARENT;
+ w_rmatrix[ci][pi] |= WITNESS_CHILD;
+
+ /*
+ * If parent was not already an ancestor of child,
+ * then we increment the descendant and ancestor counters.
+ */
+ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
+ parent->w_num_descendants++;
+ child->w_num_ancestors++;
+ }
+
+ /*
+ * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
+ * an ancestor of 'pi' during this loop.
+ */
+ for (i = 1; i <= w_max_used_index; i++) {
+ if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
+ (i != pi))
+ continue;
+
+ /* Find each descendant of 'i' and mark it as a descendant. */
+ for (j = 1; j <= w_max_used_index; j++) {
+
+ /*
+ * Skip children that are already marked as
+ * descendants of 'i'.
+ */
+ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
+ continue;
+
+ /*
+ * We are only interested in descendants of 'ci'. Note
+ * that 'ci' itself is counted as a descendant of 'ci'.
+ */
+ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
+ (j != ci))
+ continue;
+ w_rmatrix[i][j] |= WITNESS_ANCESTOR;
+ w_rmatrix[j][i] |= WITNESS_DESCENDANT;
+ w_data[i].w_num_descendants++;
+ w_data[j].w_num_ancestors++;
+
+ /*
+ * Make sure we aren't marking a node as both an
+ * ancestor and descendant. We should have caught
+ * this as a lock order reversal earlier.
+ */
+ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
+ (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
+ printf("witness rmatrix paradox! [%d][%d]=%d "
+ "both ancestor and descendant\n",
+ i, j, w_rmatrix[i][j]);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
+ (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
+ printf("witness rmatrix paradox! [%d][%d]=%d "
+ "both ancestor and descendant\n",
+ j, i, w_rmatrix[j][i]);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ }
+ }
+}
+
+static void
+itismychild(struct witness *parent, struct witness *child)
+{
+ int unlocked;
+
+ MPASS(child != NULL && parent != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ if (!witness_lock_type_equal(parent, child)) {
+ if (witness_cold == 0) {
+ unlocked = 1;
+ mtx_unlock_spin(&w_mtx);
+ } else {
+ unlocked = 0;
+ }
+ kassert_panic(
+ "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
+ "the same lock type", __func__, parent->w_name,
+ parent->w_class->lc_name, child->w_name,
+ child->w_class->lc_name);
+ if (unlocked)
+ mtx_lock_spin(&w_mtx);
+ }
+ adopt(parent, child);
+}
+
+/*
+ * Generic code for the isitmy*() functions. The rmask parameter is the
+ * expected relationship of w1 to w2.
+ */
+static int
+_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
+{
+ unsigned char r1, r2;
+ int i1, i2;
+
+ i1 = w1->w_index;
+ i2 = w2->w_index;
+ WITNESS_INDEX_ASSERT(i1);
+ WITNESS_INDEX_ASSERT(i2);
+ r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
+ r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
+
+ /* The flags on one better be the inverse of the flags on the other */
+ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
+ (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
+ printf("%s: rmatrix mismatch between %s (index %d) and %s "
+ "(index %d): w_rmatrix[%d][%d] == %hhx but "
+ "w_rmatrix[%d][%d] == %hhx\n",
+ fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
+ i2, i1, r2);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ return (r1 & rmask);
+}
+
+/*
+ * Checks if @child is a direct child of @parent.
+ */
+static int
+isitmychild(struct witness *parent, struct witness *child)
+{
+
+ return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
+}
+
+/*
+ * Checks if @descendant is a direct or indirect descendant of @ancestor.
+ */
+static int
+isitmydescendant(struct witness *ancestor, struct witness *descendant)
+{
+
+ return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
+ __func__));
+}
+
+#ifdef BLESSING
+static int
+blessed(struct witness *w1, struct witness *w2)
+{
+ int i;
+ struct witness_blessed *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_name, b->b_lock1) == 0) {
+ if (strcmp(w2->w_name, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_name, b->b_lock2) == 0)
+ if (strcmp(w2->w_name, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+#endif
+
+static struct witness *
+witness_get(void)
+{
+ struct witness *w;
+ int index;
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ if (witness_watch == -1) {
+ mtx_unlock_spin(&w_mtx);
+ return (NULL);
+ }
+ if (STAILQ_EMPTY(&w_free)) {
+ witness_watch = -1;
+ mtx_unlock_spin(&w_mtx);
+ printf("WITNESS: unable to allocate a new witness object\n");
+ return (NULL);
+ }
+ w = STAILQ_FIRST(&w_free);
+ STAILQ_REMOVE_HEAD(&w_free, w_list);
+ w_free_cnt--;
+ index = w->w_index;
+ MPASS(index > 0 && index == w_max_used_index+1 &&
+ index < WITNESS_COUNT);
+ bzero(w, sizeof(*w));
+ w->w_index = index;
+ if (index > w_max_used_index)
+ w_max_used_index = index;
+ return (w);
+}
+
+static void
+witness_free(struct witness *w)
+{
+
+ STAILQ_INSERT_HEAD(&w_free, w, w_list);
+ w_free_cnt++;
+}
+
+static struct lock_list_entry *
+witness_lock_list_get(void)
+{
+ struct lock_list_entry *lle;
+
+ if (witness_watch == -1)
+ return (NULL);
+ mtx_lock_spin(&w_mtx);
+ lle = w_lock_list_free;
+ if (lle == NULL) {
+ witness_watch = -1;
+ mtx_unlock_spin(&w_mtx);
+ printf("%s: witness exhausted\n", __func__);
+ return (NULL);
+ }
+ w_lock_list_free = lle->ll_next;
+ mtx_unlock_spin(&w_mtx);
+ bzero(lle, sizeof(*lle));
+ return (lle);
+}
+
+static void
+witness_lock_list_free(struct lock_list_entry *lle)
+{
+
+ mtx_lock_spin(&w_mtx);
+ lle->ll_next = w_lock_list_free;
+ w_lock_list_free = lle;
+ mtx_unlock_spin(&w_mtx);
+}
+
+static struct lock_instance *
+find_instance(struct lock_list_entry *list, const struct lock_object *lock)
+{
+ struct lock_list_entry *lle;
+ struct lock_instance *instance;
+ int i;
+
+ for (lle = list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ instance = &lle->ll_children[i];
+ if (instance->li_lock == lock)
+ return (instance);
+ }
+ return (NULL);
+}
+
+static void
+witness_list_lock(struct lock_instance *instance,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_object *lock;
+
+ lock = instance->li_lock;
+ prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
+ "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
+ if (lock->lo_witness->w_name != lock->lo_name)
+ prnt(" (%s)", lock->lo_witness->w_name);
+ prnt(" r = %d (%p) locked @ %s:%d\n",
+ instance->li_flags & LI_RECURSEMASK, lock,
+ fixup_filename(instance->li_file), instance->li_line);
+}
+
+#ifdef DDB
+static int
+witness_thread_has_locks(struct thread *td)
+{
+
+ if (td->td_sleeplocks == NULL)
+ return (0);
+ return (td->td_sleeplocks->ll_count != 0);
+}
+
+static int
+witness_proc_has_locks(struct proc *p)
+{
+ struct thread *td;
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (witness_thread_has_locks(td))
+ return (1);
+ }
+ return (0);
+}
+#endif
+
+int
+witness_list_locks(struct lock_list_entry **lock_list,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_list_entry *lle;
+ int i, nheld;
+
+ nheld = 0;
+ for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ witness_list_lock(&lle->ll_children[i], prnt);
+ nheld++;
+ }
+ return (nheld);
+}
+
+/*
+ * This is a bit risky at best. We call this function when we have timed
+ * out acquiring a spin lock, and we assume that the other CPU is stuck
+ * with this lock held. So, we go groveling around in the other CPU's
+ * per-cpu data to try to find the lock instance for this spin lock to
+ * see when it was last acquired.
+ */
+void
+witness_display_spinlock(struct lock_object *lock, struct thread *owner,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_instance *instance;
+ struct pcpu *pc;
+
+ if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
+ return;
+ pc = pcpu_find(owner->td_oncpu);
+ instance = find_instance(pc->pc_spinlocks, lock);
+ if (instance != NULL)
+ witness_list_lock(instance, prnt);
+}
+
+void
+witness_save(struct lock_object *lock, const char **filep, int *linep)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ /*
+ * This function is used independently in locking code to deal with
+ * Giant; the SCHEDULER_STOPPED() check can be removed here once
+ * Giant is gone.
+ */
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL) {
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ return;
+ }
+ *filep = instance->li_file;
+ *linep = instance->li_line;
+}
+
+void
+witness_restore(struct lock_object *lock, const char *file, int line)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ /*
+ * This function is used independently in locking code to deal with
+ * Giant; the SCHEDULER_STOPPED() check can be removed here once
+ * Giant is gone.
+ */
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL)
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ lock->lo_witness->w_file = file;
+ lock->lo_witness->w_line = line;
+ if (instance == NULL)
+ return;
+ instance->li_file = file;
+ instance->li_line = line;
+}
+
+void
+witness_assert(const struct lock_object *lock, int flags, const char *file,
+ int line)
+{
+#ifdef INVARIANT_SUPPORT
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if ((class->lc_flags & LC_SLEEPLOCK) != 0)
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ else if ((class->lc_flags & LC_SPINLOCK) != 0)
+ instance = find_instance(PCPU_GET(spinlocks), lock);
+ else {
+ kassert_panic("Lock (%s) %s is not sleep or spin!",
+ class->lc_name, lock->lo_name);
+ return;
+ }
+ switch (flags) {
+ case LA_UNLOCKED:
+ if (instance != NULL)
+ kassert_panic("Lock (%s) %s locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ case LA_LOCKED:
+ case LA_LOCKED | LA_RECURSED:
+ case LA_LOCKED | LA_NOTRECURSED:
+ case LA_SLOCKED:
+ case LA_SLOCKED | LA_RECURSED:
+ case LA_SLOCKED | LA_NOTRECURSED:
+ case LA_XLOCKED:
+ case LA_XLOCKED | LA_RECURSED:
+ case LA_XLOCKED | LA_NOTRECURSED:
+ if (instance == NULL) {
+ kassert_panic("Lock (%s) %s not locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ }
+ if ((flags & LA_XLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) == 0)
+ kassert_panic(
+ "Lock (%s) %s not exclusively locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_SLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) != 0)
+ kassert_panic(
+ "Lock (%s) %s exclusively locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_RECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) == 0)
+ kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_NOTRECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic("Lock (%s) %s recursed @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ default:
+ kassert_panic("Invalid lock assertion at %s:%d.",
+ fixup_filename(file), line);
+
+ }
+#endif /* INVARIANT_SUPPORT */
+}
+
+static void
+witness_setflag(struct lock_object *lock, int flag, int set)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL) {
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ return;
+ }
+
+ if (set)
+ instance->li_flags |= flag;
+ else
+ instance->li_flags &= ~flag;
+}
+
+void
+witness_norelease(struct lock_object *lock)
+{
+
+ witness_setflag(lock, LI_NORELEASE, 1);
+}
+
+void
+witness_releaseok(struct lock_object *lock)
+{
+
+ witness_setflag(lock, LI_NORELEASE, 0);
+}
+
+#ifdef DDB
+static void
+witness_ddb_list(struct thread *td)
+{
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ KASSERT(kdb_active, ("%s: not in the debugger", __func__));
+
+ if (witness_watch < 1)
+ return;
+
+ witness_list_locks(&td->td_sleeplocks, db_printf);
+
+ /*
+ * We only handle spinlocks if td == curthread. This is somewhat broken
+ * if td is currently executing on some other CPU and holds spin locks
+ * as we won't display those locks. If we had an MI way of getting
+ * the per-cpu data for a given cpu then we could use
+ * td->td_oncpu to get the list of spinlocks for this thread
+ * and "fix" this.
+ *
+ * That still wouldn't really fix this unless we locked the scheduler
+ * lock or stopped the other CPU to make sure it wasn't changing the
+ * list out from under us. It is probably best to just not try to
+ * handle threads on other CPUs for now.
+ */
+ if (td == curthread && PCPU_GET(spinlocks) != NULL)
+ witness_list_locks(PCPU_PTR(spinlocks), db_printf);
+}
+
+DB_SHOW_COMMAND(locks, db_witness_list)
+{
+ struct thread *td;
+
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+ witness_ddb_list(td);
+}
+
+DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
+{
+ struct thread *td;
+ struct proc *p;
+
+ /*
+ * It would be nice to list only threads and processes that actually
+ * hold sleep locks, but that information is currently not exported
+ * by WITNESS.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!witness_proc_has_locks(p))
+ continue;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!witness_thread_has_locks(td))
+ continue;
+ db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
+ p->p_comm, td, td->td_tid);
+ witness_ddb_list(td);
+ if (db_pager_quit)
+ return;
+ }
+ }
+}
+DB_SHOW_ALIAS(alllocks, db_witness_list_all)
+
+DB_SHOW_COMMAND(witness, db_witness_display)
+{
+
+ witness_ddb_display(db_printf);
+}
+#endif
+
+static int
+sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
+{
+ struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
+ struct witness *tmp_w1, *tmp_w2, *w1, *w2;
+ struct sbuf *sb;
+ u_int w_rmatrix1, w_rmatrix2;
+ int error, generation, i, j;
+
+ tmp_data1 = NULL;
+ tmp_data2 = NULL;
+ tmp_w1 = NULL;
+ tmp_w2 = NULL;
+ if (witness_watch < 1) {
+ error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
+ return (error);
+ }
+ if (witness_cold) {
+ error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
+ return (error);
+ }
+ error = 0;
+ sb = sbuf_new(NULL, NULL, BADSTACK_SBUF_SIZE, SBUF_AUTOEXTEND);
+ if (sb == NULL)
+ return (ENOMEM);
+
+ /* Allocate and init temporary storage space. */
+ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
+ tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
+ tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ M_WAITOK | M_ZERO);
+ tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ M_WAITOK | M_ZERO);
+ stack_zero(&tmp_data1->wlod_stack);
+ stack_zero(&tmp_data2->wlod_stack);
+
+restart:
+ mtx_lock_spin(&w_mtx);
+ generation = w_generation;
+ mtx_unlock_spin(&w_mtx);
+ sbuf_printf(sb, "Number of known direct relationships is %d\n",
+ w_lohash.wloh_count);
+ for (i = 1; i < w_max_used_index; i++) {
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /* The graph has changed, try again. */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+
+ w1 = &w_data[i];
+ if (w1->w_reversed == 0) {
+ mtx_unlock_spin(&w_mtx);
+ continue;
+ }
+
+ /* Copy w1 locally so we can release the spin lock. */
+ *tmp_w1 = *w1;
+ mtx_unlock_spin(&w_mtx);
+
+ if (tmp_w1->w_reversed == 0)
+ continue;
+ for (j = 1; j < w_max_used_index; j++) {
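+ /*
+ * Only pairs with a recorded reversal are of interest, and
+ * each unordered pair is visited once (i <= j).
+ */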
+ if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
+ continue;
+
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /* The graph has changed, try again. */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+
+ w2 = &w_data[j];
+ data1 = witness_lock_order_get(w1, w2);
+ data2 = witness_lock_order_get(w2, w1);
+
+ /*
+ * Copy information locally so we can release the
+ * spin lock.
+ */
+ *tmp_w2 = *w2;
+ w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
+ w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
+
+ if (data1) {
+ stack_zero(&tmp_data1->wlod_stack);
+ stack_copy(&data1->wlod_stack,
+ &tmp_data1->wlod_stack);
+ }
+ if (data2 && data2 != data1) {
+ stack_zero(&tmp_data2->wlod_stack);
+ stack_copy(&data2->wlod_stack,
+ &tmp_data2->wlod_stack);
+ }
+ mtx_unlock_spin(&w_mtx);
+
+ sbuf_printf(sb,
+ "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name);
+#if 0
+ sbuf_printf(sb,
+ "w_rmatrix[%s][%s] == %x, w_rmatrix[%s][%s] == %x\n",
+ tmp_w1->name, tmp_w2->w_name, w_rmatrix1,
+ tmp_w2->name, tmp_w1->w_name, w_rmatrix2);
+#endif
+ if (data1) {
+ sbuf_printf(sb,
+ "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name);
+ stack_sbuf_print(sb, &tmp_data1->wlod_stack);
+ sbuf_printf(sb, "\n");
+ }
+ if (data2 && data2 != data1) {
+ sbuf_printf(sb,
+ "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
+ tmp_w2->w_name, tmp_w2->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name);
+ stack_sbuf_print(sb, &tmp_data2->wlod_stack);
+ sbuf_printf(sb, "\n");
+ }
+ }
+ }
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /*
+ * The graph changed while we were printing stack data,
+ * try again.
+ */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+ mtx_unlock_spin(&w_mtx);
+
+ /* Free temporary storage space. */
+ free(tmp_data1, M_TEMP);
+ free(tmp_data2, M_TEMP);
+ free(tmp_w1, M_TEMP);
+ free(tmp_w2, M_TEMP);
+
+ sbuf_finish(sb);
+ error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static int
+sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
+{
+ struct witness *w;
+ struct sbuf *sb;
+ int error;
+
+ if (witness_watch < 1) {
+ error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
+ return (error);
+ }
+ if (witness_cold) {
+ error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
+ return (error);
+ }
+ error = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
+ if (sb == NULL)
+ return (ENOMEM);
+ sbuf_printf(sb, "\n");
+
+ mtx_lock_spin(&w_mtx);
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_displayed = 0;
+ STAILQ_FOREACH(w, &w_all, w_list)
+ witness_add_fullgraph(sb, w);
+ mtx_unlock_spin(&w_mtx);
+
+ /*
+ * Close the sbuf and return to userland.
+ */
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static int
+sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
+{
+ int error, value;
+
+ value = witness_watch;
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
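+ /*
+ * Valid settings are -1, 0 and 1; once WITNESS has been disabled
+ * permanently (-1) it cannot be switched back on.
+ */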
+ if (value > 1 || value < -1 ||
+ (witness_watch == -1 && value != witness_watch))
+ return (EINVAL);
+ witness_watch = value;
+ return (0);
+}
+
+static void
+witness_add_fullgraph(struct sbuf *sb, struct witness *w)
+{
+ int i;
+
+ if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
+ return;
+ w->w_displayed = 1;
+
+ WITNESS_INDEX_ASSERT(w->w_index);
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
+ sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
+ w_data[i].w_name);
+ witness_add_fullgraph(sb, &w_data[i]);
+ }
+ }
+}
+
+/*
+ * A simple hash function.  Takes a key pointer and a key size.  If size == 0,
+ * the key is interpreted as a string and read up to the null terminator.
+ * Otherwise, the first size bytes are read.  Returns an unsigned 32-bit hash
+ * value computed from the key.
+ */
+static uint32_t
+witness_hash_djb2(const uint8_t *key, uint32_t size)
+{
+ unsigned int hash = 5381;
+ int i;
+
+ /* hash = hash * 33 + key[i] */
+ if (size)
+ for (i = 0; i < size; i++)
+ hash = ((hash << 5) + hash) + (unsigned int)key[i];
+ else
+ for (i = 0; key[i] != 0; i++)
+ hash = ((hash << 5) + hash) + (unsigned int)key[i];
+
+ return (hash);
+}
+
+
+/*
+ * Initializes the two witness hash tables. Called exactly once from
+ * witness_initialize().
+ */
+static void
+witness_init_hash_tables(void)
+{
+ int i;
+
+ MPASS(witness_cold);
+
+ /* Initialize the hash tables. */
+ for (i = 0; i < WITNESS_HASH_SIZE; i++)
+ w_hash.wh_array[i] = NULL;
+
+ w_hash.wh_size = WITNESS_HASH_SIZE;
+ w_hash.wh_count = 0;
+
+ /* Initialize the lock order data hash. */
+ w_lofree = NULL;
+ for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
+ memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
+ w_lodata[i].wlod_next = w_lofree;
+ w_lofree = &w_lodata[i];
+ }
+ w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
+ w_lohash.wloh_count = 0;
+ for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
+ w_lohash.wloh_array[i] = NULL;
+}
+
+static struct witness *
+witness_hash_get(const char *key)
+{
+ struct witness *w;
+ uint32_t hash;
+
+ MPASS(key != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
+ w = w_hash.wh_array[hash];
+ while (w != NULL) {
+ if (strcmp(w->w_name, key) == 0)
+ goto out;
+ w = w->w_hash_next;
+ }
+
+out:
+ return (w);
+}
+
+static void
+witness_hash_put(struct witness *w)
+{
+ uint32_t hash;
+
+ MPASS(w != NULL);
+ MPASS(w->w_name != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ KASSERT(witness_hash_get(w->w_name) == NULL,
+ ("%s: trying to add a hash entry that already exists!", __func__));
+ KASSERT(w->w_hash_next == NULL,
+ ("%s: w->w_hash_next != NULL", __func__));
+
+ hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
+ w->w_hash_next = w_hash.wh_array[hash];
+ w_hash.wh_array[hash] = w;
+ w_hash.wh_count++;
+}
+
+
+static struct witness_lock_order_data *
+witness_lock_order_get(struct witness *parent, struct witness *child)
+{
+ struct witness_lock_order_data *data = NULL;
+ struct witness_lock_order_key key;
+ unsigned int hash;
+
+ MPASS(parent != NULL && child != NULL);
+ key.from = parent->w_index;
+ key.to = child->w_index;
+ WITNESS_INDEX_ASSERT(key.from);
+ WITNESS_INDEX_ASSERT(key.to);
+ if ((w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN) == 0)
+ goto out;
+
+ hash = witness_hash_djb2((const char*)&key,
+ sizeof(key)) % w_lohash.wloh_size;
+ data = w_lohash.wloh_array[hash];
+ while (data != NULL) {
+ if (witness_lock_order_key_equal(&data->wlod_key, &key))
+ break;
+ data = data->wlod_next;
+ }
+
+out:
+ return (data);
+}
+
+/*
+ * Verify that parent and child have a known relationship, are not the same,
+ * and child is actually a child of parent. This is done without w_mtx
+ * to avoid contention in the common case.
+ */
+static int
+witness_lock_order_check(struct witness *parent, struct witness *child)
+{
+
+ if (parent != child &&
+ w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN &&
+ isitmychild(parent, child))
+ return (1);
+
+ return (0);
+}
+
+static int
+witness_lock_order_add(struct witness *parent, struct witness *child)
+{
+ struct witness_lock_order_data *data = NULL;
+ struct witness_lock_order_key key;
+ unsigned int hash;
+
+ MPASS(parent != NULL && child != NULL);
+ key.from = parent->w_index;
+ key.to = child->w_index;
+ WITNESS_INDEX_ASSERT(key.from);
+ WITNESS_INDEX_ASSERT(key.to);
+ if (w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN)
+ return (1);
+
+ hash = witness_hash_djb2((const char*)&key,
+ sizeof(key)) % w_lohash.wloh_size;
+ w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
+ data = w_lofree;
+ if (data == NULL)
+ return (0);
+ w_lofree = data->wlod_next;
+ data->wlod_next = w_lohash.wloh_array[hash];
+ data->wlod_key = key;
+ w_lohash.wloh_array[hash] = data;
+ w_lohash.wloh_count++;
+ stack_zero(&data->wlod_stack);
+ stack_save(&data->wlod_stack);
+ return (1);
+}
+
+/* Call this whenever the structure of the witness graph changes. */
+static void
+witness_increment_graph_generation(void)
+{
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ w_generation++;
+}
+
+#ifdef KDB
+static void
+_witness_debugger(int cond, const char *msg)
+{
+
+ if (witness_trace && cond)
+ kdb_backtrace();
+ if (witness_kdb && cond)
+ kdb_enter(KDB_WHY_WITNESS, msg);
+}
+#endif
diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c
new file mode 100644
index 0000000..7a82017
--- /dev/null
+++ b/sys/kern/sys_capability.c
@@ -0,0 +1,613 @@
+/*-
+ * Copyright (c) 2008-2011 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Jonathan Anderson
+ * Copyright (c) 2012 FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Portions of this software were developed by Pawel Jakub Dawidek under
+ * sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * FreeBSD kernel capability facility.
+ *
+ * Two kernel features are implemented here: capability mode, a sandboxed mode
+ * of execution for processes, and capabilities, a refinement on file
+ * descriptors that allows fine-grained control over operations on the file
+ * descriptor. Collectively, these allow processes to run in the style of a
+ * historic "capability system" in which they can use only resources
+ * explicitly delegated to them. This model is enforced by restricting access
+ * to global namespaces in capability mode.
+ *
+ * Capabilities wrap other file descriptor types, binding them to a constant
+ * rights mask set when the capability is created. New capabilities may be
+ * derived from existing capabilities, but only if they have the same or a
+ * strict subset of the rights on the original capability.
+ *
+ * System calls permitted in capability mode are defined in capabilities.conf;
+ * calls must be carefully audited for safety to ensure that they don't allow
+ * escape from a sandbox. Some calls permit only a subset of operations in
+ * capability mode -- for example, shm_open(2) is limited to creating
+ * anonymous, rather than named, POSIX shared memory objects.
+ */
+
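+/*
+ * An illustrative userland sketch of the intended usage (interfaces
+ * assumed here: cap_rights_init(3), cap_rights_limit(2), cap_enter(2)):
+ *
+ *	cap_rights_t rights;
+ *	int fd = open("/etc/passwd", O_RDONLY);
+ *
+ *	cap_rights_init(&rights, CAP_READ, CAP_FSTAT);
+ *	cap_rights_limit(fd, &rights);
+ *	cap_enter();
+ *
+ * After cap_enter() the process can still read(2) from fd, but open(2)
+ * of a new path fails with ECAPMODE, and operations on fd beyond the
+ * granted rights fail with ENOTCAPABLE.
+ */
+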
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+
+#ifdef CAPABILITY_MODE
+
+FEATURE(security_capability_mode, "Capsicum Capability Mode");
+
+/*
+ * System call to enter capability mode for the process.
+ */
+int
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
+{
+ struct ucred *newcred, *oldcred;
+ struct proc *p;
+
+ if (IN_CAPABILITY_MODE(td))
+ return (0);
+
+ newcred = crget();
+ p = td->td_proc;
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ crcopy(newcred, oldcred);
+ newcred->cr_flags |= CRED_FLAG_CAPMODE;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+}
+
+/*
+ * System call to query whether the process is in capability mode.
+ */
+int
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+{
+ u_int i;
+
+ i = IN_CAPABILITY_MODE(td) ? 1 : 0;
+ return (copyout(&i, uap->modep, sizeof(i)));
+}
+
+#else /* !CAPABILITY_MODE */
+
+int
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* CAPABILITY_MODE */
+
+#ifdef CAPABILITIES
+
+FEATURE(security_capabilities, "Capsicum Capabilities");
+
+MALLOC_DECLARE(M_FILECAPS);
+
+static inline int
+_cap_check(const cap_rights_t *havep, const cap_rights_t *needp,
+ enum ktr_cap_fail_type type)
+{
+ int i;
+
+ for (i = 0; i < nitems(havep->cr_rights); i++) {
+ if (!cap_rights_contains(havep, needp)) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(type, needp, havep);
+#endif
+ return (ENOTCAPABLE);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Test whether a capability grants the requested rights.
+ */
+int
+cap_check(const cap_rights_t *havep, const cap_rights_t *needp)
+{
+
+ return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE));
+}
+
+/*
+ * Convert capability rights into VM access flags.
+ */
+u_char
+cap_rights_to_vmprot(cap_rights_t *havep)
+{
+ u_char maxprot;
+
+ maxprot = VM_PROT_NONE;
+ if (cap_rights_is_set(havep, CAP_MMAP_R))
+ maxprot |= VM_PROT_READ;
+ if (cap_rights_is_set(havep, CAP_MMAP_W))
+ maxprot |= VM_PROT_WRITE;
+ if (cap_rights_is_set(havep, CAP_MMAP_X))
+ maxprot |= VM_PROT_EXECUTE;
+
+ return (maxprot);
+}
+
+/*
+ * Extract rights from a capability for monitoring purposes -- not for use in
+ * any other way, as we want to keep all capability permission evaluation in
+ * this one file.
+ */
+cap_rights_t *
+cap_rights(struct filedesc *fdp, int fd)
+{
+
+ return (&fdp->fd_ofiles[fd].fde_rights);
+}
+
+/*
+ * System call to limit rights of the given capability.
+ */
+int
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
+{
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, fd, version;
+
+ cap_rights_init(&rights);
+
+ error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]));
+ if (error != 0)
+ return (error);
+ version = CAPVER(&rights);
+ if (version != CAP_RIGHTS_VERSION_00)
+ return (EINVAL);
+
+ error = copyin(uap->rightsp, &rights,
+ sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights));
+ if (error != 0)
+ return (error);
+ /* Check for race. */
+ if (CAPVER(&rights) != version)
+ return (EINVAL);
+
+ if (!cap_rights_is_valid(&rights))
+ return (EINVAL);
+
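+ /*
+ * The rights version is kept in the top two bits of the first
+ * array element; upgrade older (but valid) versions to the
+ * current CAP_RIGHTS_VERSION before storing the rights.
+ */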
+ if (version != CAP_RIGHTS_VERSION) {
+ rights.cr_rights[0] &= ~(0x3ULL << 62);
+ rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrcaprights(&rights);
+#endif
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_RIGHTS(&rights);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ error = _cap_check(cap_rights(fdp, fd), &rights, CAPFAIL_INCREASE);
+ if (error == 0) {
+ fdp->fd_ofiles[fd].fde_rights = rights;
+ if (!cap_rights_is_set(&rights, CAP_IOCTL)) {
+ free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS);
+ fdp->fd_ofiles[fd].fde_ioctls = NULL;
+ fdp->fd_ofiles[fd].fde_nioctls = 0;
+ }
+ if (!cap_rights_is_set(&rights, CAP_FCNTL))
+ fdp->fd_ofiles[fd].fde_fcntls = 0;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+}
+
+/*
+ * System call to query the rights mask associated with a capability.
+ */
+int
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
+{
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, fd, i, n;
+
+ if (uap->version != CAP_RIGHTS_VERSION_00)
+ return (EINVAL);
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ return (EBADF);
+ }
+ rights = *cap_rights(fdp, fd);
+ FILEDESC_SUNLOCK(fdp);
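+ /*
+ * A structure of version v carries v + 2 array elements; only
+ * copy out as many elements as the caller's version understands.
+ */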
+ n = uap->version + 2;
+ if (uap->version != CAPVER(&rights)) {
+ /*
+ * For older versions we need to check that all of the rights on
+ * the descriptor are understood by the caller; if any are not,
+ * we have to return an error.
+ */
+ for (i = n; i < CAPARSIZE(&rights); i++) {
+ if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0)
+ return (EINVAL);
+ }
+ }
+ error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrcaprights(&rights);
+#endif
+ return (error);
+}
+
+/*
+ * Test whether a capability grants the given ioctl command.
+ * If the descriptor doesn't have CAP_IOCTL, its ioctl list is empty and
+ * ENOTCAPABLE will be returned.
+ */
+int
+cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd)
+{
+ u_long *cmds;
+ ssize_t ncmds;
+ long i;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("%s: invalid fd=%d", __func__, fd));
+
+ ncmds = fdp->fd_ofiles[fd].fde_nioctls;
+ if (ncmds == -1)
+ return (0);
+
+ cmds = fdp->fd_ofiles[fd].fde_ioctls;
+ for (i = 0; i < ncmds; i++) {
+ if (cmds[i] == cmd)
+ return (0);
+ }
+
+ return (ENOTCAPABLE);
+}
+
+/*
+ * Check if the current ioctls list can be replaced by the new one.
+ */
+static int
+cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds,
+ size_t ncmds)
+{
+ u_long *ocmds;
+ ssize_t oncmds;
+ u_long i;
+ long j;
+
+ oncmds = fdp->fd_ofiles[fd].fde_nioctls;
+ if (oncmds == -1)
+ return (0);
+ if (oncmds < (ssize_t)ncmds)
+ return (ENOTCAPABLE);
+
+ ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+ for (i = 0; i < ncmds; i++) {
+ for (j = 0; j < oncmds; j++) {
+ if (cmds[i] == ocmds[j])
+ break;
+ }
+ if (j == oncmds)
+ return (ENOTCAPABLE);
+ }
+
+ return (0);
+}
+
+int
+kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds)
+{
+ struct filedesc *fdp;
+ u_long *ocmds;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+
+ error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds);
+ if (error != 0)
+ goto out;
+
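+ /*
+ * Install the new list and let the common exit path free the old
+ * one: after the swap 'cmds' points at the previous list, which
+ * is what the free() below expects.
+ */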
+ ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+ fdp->fd_ofiles[fd].fde_ioctls = cmds;
+ fdp->fd_ofiles[fd].fde_nioctls = ncmds;
+
+ cmds = ocmds;
+ error = 0;
+out:
+ FILEDESC_XUNLOCK(fdp);
+ free(cmds, M_FILECAPS);
+ return (error);
+}
+
+int
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
+{
+ u_long *cmds;
+ size_t ncmds;
+ int error;
+
+ ncmds = uap->ncmds;
+
+ if (ncmds > 256) /* XXX: Is 256 sane? */
+ return (EINVAL);
+
+ if (ncmds == 0) {
+ cmds = NULL;
+ } else {
+ cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK);
+ error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds);
+ if (error != 0) {
+ free(cmds, M_FILECAPS);
+ return (error);
+ }
+ }
+
+ return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds));
+}
+
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
+{
+ struct filedesc *fdp;
+ struct filedescent *fdep;
+ u_long *cmds;
+ size_t maxcmds;
+ int error, fd;
+
+ fd = uap->fd;
+ cmds = uap->cmds;
+ maxcmds = uap->maxcmds;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+
+ /*
+ * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL)
+ * the only sane thing we can do is to leave the given array untouched and
+ * return CAP_IOCTLS_ALL.
+ */
+
+ fdep = &fdp->fd_ofiles[fd];
+ if (cmds != NULL && fdep->fde_ioctls != NULL) {
+ error = copyout(fdep->fde_ioctls, cmds,
+ sizeof(cmds[0]) * MIN(fdep->fde_nioctls, maxcmds));
+ if (error != 0)
+ goto out;
+ }
+ if (fdep->fde_nioctls == -1)
+ td->td_retval[0] = CAP_IOCTLS_ALL;
+ else
+ td->td_retval[0] = fdep->fde_nioctls;
+
+ error = 0;
+out:
+ FILEDESC_SUNLOCK(fdp);
+ return (error);
+}
+
+/*
+ * Test whether a capability grants the given fcntl command.
+ */
+int
+cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
+{
+ uint32_t fcntlcap;
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("%s: invalid fd=%d", __func__, fd));
+
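+ /* fcntl rights are kept as a bitmask indexed by command number. */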
+ fcntlcap = (1 << cmd);
+ KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0,
+ ("Unsupported fcntl=%d.", cmd));
+
+ if ((fdp->fd_ofiles[fd].fde_fcntls & fcntlcap) != 0)
+ return (0);
+
+ return (ENOTCAPABLE);
+}
+
+int
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
+{
+ struct filedesc *fdp;
+ uint32_t fcntlrights;
+ int fd;
+
+ fd = uap->fd;
+ fcntlrights = uap->fcntlrights;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_FCNTL_RIGHTS(fcntlrights);
+
+ if ((fcntlrights & ~CAP_FCNTL_ALL) != 0)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+
+ if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (ENOTCAPABLE);
+ }
+
+ fdp->fd_ofiles[fd].fde_fcntls = fcntlrights;
+ FILEDESC_XUNLOCK(fdp);
+
+ return (0);
+}
+
+int
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
+{
+ struct filedesc *fdp;
+ uint32_t rights;
+ int fd;
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ return (EBADF);
+ }
+ rights = fdp->fd_ofiles[fd].fde_fcntls;
+ FILEDESC_SUNLOCK(fdp);
+
+ return (copyout(&rights, uap->fcntlrightsp, sizeof(rights)));
+}
+
+#else /* !CAPABILITIES */
+
+/*
+ * Stub capability functions for kernels built without "options CAPABILITIES".
+ */
+
+int
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* CAPABILITIES */
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 0000000..d4d6293
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,1815 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/sleepqueue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+
+int iosize_max_clamp = 1;
+SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
+ &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
+/*
+ * Assert that the return value of read(2) and write(2) syscalls fits
+ * into a register. If not, an architecture will need to provide the
+ * usermode wrappers to reconstruct the result.
+ */
+CTASSERT(sizeof(register_t) >= sizeof(size_t));
+
+static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
+static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
+MALLOC_DEFINE(M_IOV, "iov", "large iov's");
+
+static int pollout(struct thread *, struct pollfd *, struct pollfd *,
+ u_int);
+static int pollscan(struct thread *, struct pollfd *, u_int);
+static int pollrescan(struct thread *);
+static int selscan(struct thread *, fd_mask **, fd_mask **, int);
+static int selrescan(struct thread *, fd_mask **, fd_mask **);
+static void selfdalloc(struct thread *, void *);
+static void selfdfree(struct seltd *, struct selfd *);
+static int dofileread(struct thread *, int, struct file *, struct uio *,
+ off_t, int);
+static int dofilewrite(struct thread *, int, struct file *, struct uio *,
+ off_t, int);
+static void doselwakeup(struct selinfo *, int);
+static void seltdinit(struct thread *);
+static int seltdwait(struct thread *, sbintime_t, sbintime_t);
+static void seltdclear(struct thread *);
+
+/*
+ * One seltd is allocated per thread, on demand.
+ *
+ * t - protected by st_mtx
+ * k - Only accessed by curthread or read-only
+ */
+struct seltd {
+ STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */
+ struct selfd *st_free1; /* (k) free fd for read set. */
+ struct selfd *st_free2; /* (k) free fd for write set. */
+ struct mtx st_mtx; /* Protects struct seltd */
+ struct cv st_wait; /* (t) Wait channel. */
+ int st_flags; /* (t) SELTD_ flags. */
+};
+
+#define SELTD_PENDING 0x0001 /* We have pending events. */
+#define SELTD_RESCAN 0x0002 /* Doing a rescan. */
+
+/*
+ * One selfd is allocated per thread per file descriptor.
+ * f - protected by sf_mtx
+ */
+struct selfd {
+ STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */
+ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */
+ struct selinfo *sf_si; /* (f) selinfo when linked. */
+ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */
+ struct seltd *sf_td; /* (k) owning seltd. */
+ void *sf_cookie; /* (k) fd or pollfd. */
+};
+
+static uma_zone_t selfd_zone;
+static struct mtx_pool *mtxpool_select;
+
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+};
+#endif
+int
+sys_read(td, uap)
+ struct thread *td;
+ struct read_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_readv(td, uap->fd, &auio);
+ return(error);
+}
+
+/*
+ * Positioned read system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pread_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+int
+sys_pread(td, uap)
+ struct thread *td;
+ struct pread_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_preadv(td, uap->fd, &auio, uap->offset);
+ return(error);
+}
+
+int
+freebsd6_pread(td, uap)
+ struct thread *td;
+ struct freebsd6_pread_args *uap;
+{
+ struct pread_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (sys_pread(td, &oargs));
+}
+
+/*
+ * Scatter read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+sys_readv(struct thread *td, struct readv_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_readv(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_readv(struct thread *td, int fd, struct uio *auio)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
+ if (error)
+ return (error);
+ error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Scatter positioned read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct preadv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+ off_t offset;
+};
+#endif
+int
+sys_preadv(struct thread *td, struct preadv_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_preadv(td, uap->fd, auio, uap->offset);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_preadv(td, fd, auio, offset)
+ struct thread *td;
+ int fd;
+ struct uio *auio;
+ off_t offset;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
+ if (error)
+ return (error);
+ if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
+ error = ESPIPE;
+ else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+ error = EINVAL;
+ else
+ error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common code for readv and preadv that reads data in
+ * from a file using the passed in uio, offset, and flags.
+ */
+static int
+dofileread(td, fd, fp, auio, offset, flags)
+ struct thread *td;
+ int fd;
+ struct file *fp;
+ struct uio *auio;
+ off_t offset;
+ int flags;
+{
+ ssize_t cnt;
+ int error;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+
+ /* Finish zero length reads right here */
+ if (auio->uio_resid == 0) {
+ td->td_retval[0] = 0;
+ return(0);
+ }
+ auio->uio_rw = UIO_READ;
+ auio->uio_offset = offset;
+ auio->uio_td = td;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(auio);
+#endif
+ cnt = auio->uio_resid;
+ if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
+ if (auio->uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio->uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = cnt;
+ ktrgenio(fd, UIO_READ, ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+};
+#endif
+int
+sys_write(td, uap)
+ struct thread *td;
+ struct write_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = (void *)(uintptr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_writev(td, uap->fd, &auio);
+ return(error);
+}
+
+/*
+ * Positioned write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwrite_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+int
+sys_pwrite(td, uap)
+ struct thread *td;
+ struct pwrite_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = (void *)(uintptr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_pwritev(td, uap->fd, &auio, uap->offset);
+ return(error);
+}
+
+int
+freebsd6_pwrite(td, uap)
+ struct thread *td;
+ struct freebsd6_pwrite_args *uap;
+{
+ struct pwrite_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (sys_pwrite(td, &oargs));
+}
+
+/*
+ * Gather write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+sys_writev(struct thread *td, struct writev_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_writev(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_writev(struct thread *td, int fd, struct uio *auio)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+ if (error)
+ return (error);
+ error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Gather positioned write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwritev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+ off_t offset;
+};
+#endif
+int
+sys_pwritev(struct thread *td, struct pwritev_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_pwritev(td, uap->fd, auio, uap->offset);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_pwritev(td, fd, auio, offset)
+ struct thread *td;
+ struct uio *auio;
+ int fd;
+ off_t offset;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
+ if (error)
+ return (error);
+ if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
+ error = ESPIPE;
+ else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+ error = EINVAL;
+ else
+ error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common code for writev and pwritev that writes data to
+ * a file using the passed in uio, offset, and flags.
+ */
+static int
+dofilewrite(td, fd, fp, auio, offset, flags)
+ struct thread *td;
+ int fd;
+ struct file *fp;
+ struct uio *auio;
+ off_t offset;
+ int flags;
+{
+ ssize_t cnt;
+ int error;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+
+ auio->uio_rw = UIO_WRITE;
+ auio->uio_td = td;
+ auio->uio_offset = offset;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(auio);
+#endif
+ cnt = auio->uio_resid;
+ if (fp->f_type == DTYPE_VNODE &&
+ (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
+ bwillwrite();
+ if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
+ if (auio->uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Socket layer is responsible for issuing SIGPIPE. */
+ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio->uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = cnt;
+ ktrgenio(fd, UIO_WRITE, ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ *
+ * Can't use fget_write() here, since we must return EINVAL and not EBADF if
+ * the descriptor isn't writable.
+ */
+int
+kern_ftruncate(td, fd, length)
+ struct thread *td;
+ int fd;
+ off_t length;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ if (length < 0)
+ return (EINVAL);
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
+ if (error)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if (!(fp->f_flag & FWRITE)) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ error = fo_truncate(fp, length, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+int
+sys_ftruncate(td, uap)
+ struct thread *td;
+ struct ftruncate_args *uap;
+{
+
+ return (kern_ftruncate(td, uap->fd, uap->length));
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+int
+oftruncate(td, uap)
+ struct thread *td;
+ struct oftruncate_args *uap;
+{
+
+ return (kern_ftruncate(td, uap->fd, uap->length));
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ u_long com;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ioctl(struct thread *td, struct ioctl_args *uap)
+{
+ u_long com;
+ int arg, error;
+ u_int size;
+ caddr_t data;
+
+ if (uap->com > 0xffffffff) {
+ printf(
+ "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
+ td->td_proc->p_pid, td->td_name, uap->com);
+ uap->com &= 0xffffffff;
+ }
+ com = uap->com;
+
+ /*
+ * Interpret the high-order word to find the amount of data to be
+ * copied to/from the user's address space.
+ */
+ size = IOCPARM_LEN(com);
+ if ((size > IOCPARM_MAX) ||
+ ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
+#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ ((com & IOC_OUT) && size == 0) ||
+#else
+ ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
+#endif
+ ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
+ return (ENOTTY);
+
+ if (size > 0) {
+ if (com & IOC_VOID) {
+ /* Integer argument. */
+ arg = (intptr_t)uap->data;
+ data = (void *)&arg;
+ size = 0;
+ } else
+ data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ } else
+ data = (void *)&uap->data;
+ if (com & IOC_IN) {
+ error = copyin(uap->data, data, (u_int)size);
+ if (error) {
+ if (size > 0)
+ free(data, M_IOCTLOPS);
+ return (error);
+ }
+ } else if (com & IOC_OUT) {
+ /*
+ * Zero the buffer so the user always
+ * gets back something deterministic.
+ */
+ bzero(data, size);
+ }
+
+ error = kern_ioctl(td, uap->fd, com, data);
+
+ if (error == 0 && (com & IOC_OUT))
+ error = copyout(data, uap->data, (u_int)size);
+
+ if (size > 0)
+ free(data, M_IOCTLOPS);
+ return (error);
+}
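+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change.
+ * The size/direction test above relies on how ioctl commands are encoded by
+ * the macros in <sys/ioccom.h>.  The structure and command below are
+ * hypothetical and only show what IOCPARM_LEN() and IOC_OUT recover.
+ */
+#if 0
+#include <sys/ioccom.h>
+
+struct mydev_stats {			/* hypothetical argument structure */
+	int	rx_packets;
+	int	tx_packets;
+};
+#define	MYDEVGSTATS	_IOR('m', 1, struct mydev_stats)	/* hypothetical */
+
+/*
+ * For this command IOCPARM_LEN(MYDEVGSTATS) == sizeof(struct mydev_stats)
+ * and (MYDEVGSTATS & IOC_OUT) != 0, so sys_ioctl() above allocates and
+ * zeroes a kernel buffer of that size, calls kern_ioctl(), and copies the
+ * result back out to uap->data.
+ */
+#endif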
+
+int
+kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+#ifndef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int error, tmp, locked;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_CMD(com);
+
+ fdp = td->td_proc->p_fd;
+
+ switch (com) {
+ case FIONCLEX:
+ case FIOCLEX:
+ FILEDESC_XLOCK(fdp);
+ locked = LA_XLOCKED;
+ break;
+ default:
+#ifdef CAPABILITIES
+ FILEDESC_SLOCK(fdp);
+ locked = LA_SLOCKED;
+#else
+ locked = LA_UNLOCKED;
+#endif
+ break;
+ }
+
+#ifdef CAPABILITIES
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+ if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
+ fp = NULL; /* fhold() was not called yet */
+ goto out;
+ }
+ fhold(fp);
+ if (locked == LA_SLOCKED) {
+ FILEDESC_SUNLOCK(fdp);
+ locked = LA_UNLOCKED;
+ }
+#else
+ error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+ if (error != 0) {
+ fp = NULL;
+ goto out;
+ }
+#endif
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ switch (com) {
+ case FIONCLEX:
+ fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
+ goto out;
+ case FIOCLEX:
+ fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
+ goto out;
+ case FIONBIO:
+ if ((tmp = *(int *)data))
+ atomic_set_int(&fp->f_flag, FNONBLOCK);
+ else
+ atomic_clear_int(&fp->f_flag, FNONBLOCK);
+ data = (void *)&tmp;
+ break;
+ case FIOASYNC:
+ if ((tmp = *(int *)data))
+ atomic_set_int(&fp->f_flag, FASYNC);
+ else
+ atomic_clear_int(&fp->f_flag, FASYNC);
+ data = (void *)&tmp;
+ break;
+ }
+
+ error = fo_ioctl(fp, com, data, td->td_ucred, td);
+out:
+ switch (locked) {
+ case LA_XLOCKED:
+ FILEDESC_XUNLOCK(fdp);
+ break;
+#ifdef CAPABILITIES
+ case LA_SLOCKED:
+ FILEDESC_SUNLOCK(fdp);
+ break;
+#endif
+ default:
+ FILEDESC_UNLOCK_ASSERT(fdp);
+ break;
+ }
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+poll_no_poll(int events)
+{
+ /*
+ * Return true for read/write. If the user asked for something
+ * special, return POLLNVAL, so that clients have a way of
+ * determining reliably whether or not the extended
+ * functionality is present without hard-coding knowledge
+ * of specific filesystem implementations.
+ */
+ if (events & ~POLLSTANDARD)
+ return (POLLNVAL);
+
+ return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+int
+sys_pselect(struct thread *td, struct pselect_args *uap)
+{
+ struct timespec ts;
+ struct timeval tv, *tvp;
+ sigset_t set, *uset;
+ int error;
+
+ if (uap->ts != NULL) {
+ error = copyin(uap->ts, &ts, sizeof(ts));
+ if (error != 0)
+ return (error);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ tvp = &tv;
+ } else
+ tvp = NULL;
+ if (uap->sm != NULL) {
+ error = copyin(uap->sm, &set, sizeof(set));
+ if (error != 0)
+ return (error);
+ uset = &set;
+ } else
+ uset = NULL;
+ return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
+ uset, NFDBITS));
+}
+
+int
+kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
+ struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
+{
+ int error;
+
+ if (uset != NULL) {
+ error = kern_sigprocmask(td, SIG_SETMASK, uset,
+ &td->td_oldsigmask, 0);
+ if (error != 0)
+ return (error);
+ td->td_pflags |= TDP_OLDMASK;
+ /*
+ * Make sure that ast() is called on return to
+ * usermode and TDP_OLDMASK is cleared, restoring old
+ * sigmask.
+ */
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+ error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
+int
+sys_select(struct thread *td, struct select_args *uap)
+{
+ struct timeval tv, *tvp;
+ int error;
+
+ if (uap->tv != NULL) {
+ error = copyin(uap->tv, &tv, sizeof(tv));
+ if (error)
+ return (error);
+ tvp = &tv;
+ } else
+ tvp = NULL;
+
+ return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
+ NFDBITS));
+}
+
+/*
+ * In the unlikely case when the user specified an nd greater than the last
+ * open file descriptor, check that no bits are set after the last
+ * valid fd.  We must return EBADF if any are set.
+ *
+ * There are applications that rely on this behaviour.
+ *
+ * nd is fd_lastfile + 1.
+ */
+static int
+select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
+{
+ char *addr, *oaddr;
+ int b, i, res;
+ uint8_t bits;
+
+ if (nd >= ndu || fd_in == NULL)
+ return (0);
+
+ oaddr = NULL;
+ bits = 0; /* silence gcc */
+ for (i = nd; i < ndu; i++) {
+ b = i / NBBY;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ addr = (char *)fd_in + b;
+#else
+ addr = (char *)fd_in;
+ if (abi_nfdbits == NFDBITS) {
+ addr += rounddown(b, sizeof(fd_mask)) +
+ sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
+ } else {
+ addr += rounddown(b, sizeof(uint32_t)) +
+ sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
+ }
+#endif
+ if (addr != oaddr) {
+ res = fubyte(addr);
+ if (res == -1)
+ return (EFAULT);
+ oaddr = addr;
+ bits = res;
+ }
+ if ((bits & (1 << (i % NBBY))) != 0)
+ return (EBADF);
+ }
+ return (0);
+}
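+
+/*
+ * Editor's note: an illustrative userland sketch, not part of the original
+ * change, showing the case select_check_badfd() guards against.  It assumes
+ * descriptor 42 is not open and is above every open descriptor in the
+ * process; the function name is hypothetical.
+ */
+#if 0
+#include <sys/select.h>
+#include <errno.h>
+#include <stddef.h>
+
+int
+stray_bit_returns_ebadf(void)
+{
+	fd_set rd;
+
+	FD_ZERO(&rd);
+	FD_SET(42, &rd);
+	/*
+	 * nd (64) is larger than fd_lastfile + 1, so the scan loop would
+	 * never look at bit 42; select_check_badfd() makes the call fail
+	 * with EBADF instead of silently ignoring the stray bit.
+	 */
+	return (select(64, &rd, NULL, NULL, NULL) == -1 && errno == EBADF);
+}
+#endif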
+
+int
+kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
+ fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
+{
+ struct filedesc *fdp;
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
+ struct timeval rtv;
+ sbintime_t asbt, precision, rsbt;
+ u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
+ int error, lf, ndu;
+
+ if (nd < 0)
+ return (EINVAL);
+ fdp = td->td_proc->p_fd;
+ ndu = nd;
+ lf = fdp->fd_lastfile;
+ if (nd > lf + 1)
+ nd = lf + 1;
+
+ error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+ error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+ error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
+ nbufbytes = 0;
+ if (fd_in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (fd_ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (fd_ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
+
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
+ sbp = selbits;
+#define getbits(name, x) \
+ do { \
+ if (name == NULL) { \
+ ibits[x] = NULL; \
+ obits[x] = NULL; \
+ } else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(name, ibits[x], ncpubytes); \
+ if (error != 0) \
+ goto done; \
+ bzero((char *)ibits[x] + ncpubytes, \
+ ncpbytes - ncpubytes); \
+ } \
+ } while (0)
+ getbits(fd_in, 0);
+ getbits(fd_ou, 1);
+ getbits(fd_ex, 2);
+#undef getbits
+
+#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
+ /*
+ * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
+ * we are running under 32-bit emulation. This should be more
+ * generic.
+ */
+#define swizzle_fdset(bits) \
+ if (abi_nfdbits != NFDBITS && bits != NULL) { \
+ int i; \
+ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \
+ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \
+ }
+#else
+#define swizzle_fdset(bits)
+#endif
+
+ /* Make sure the bit order makes it through an ABI transition */
+ swizzle_fdset(ibits[0]);
+ swizzle_fdset(ibits[1]);
+ swizzle_fdset(ibits[2]);
+
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
+
+ precision = 0;
+ if (tvp != NULL) {
+ rtv = *tvp;
+ if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
+ rtv.tv_usec >= 1000000) {
+ error = EINVAL;
+ goto done;
+ }
+ if (!timevalisset(&rtv))
+ asbt = 0;
+ else if (rtv.tv_sec <= INT32_MAX) {
+ rsbt = tvtosbt(rtv);
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = -1;
+ } else
+ asbt = -1;
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /* Iterate until the timeout expires or descriptors become ready. */
+ for (;;) {
+ error = selscan(td, ibits, obits, nd);
+ if (error || td->td_retval[0] != 0)
+ break;
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ error = selrescan(td, ibits, obits);
+ if (error || td->td_retval[0] != 0)
+ break;
+ }
+ seltdclear(td);
+
+done:
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+
+ /* swizzle bit order back, if necessary */
+ swizzle_fdset(obits[0]);
+ swizzle_fdset(obits[1]);
+ swizzle_fdset(obits[2]);
+#undef swizzle_fdset
+
+#define putbits(name, x) \
+ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
+ error = error2;
+ if (error == 0) {
+ int error2;
+
+ putbits(fd_in, 0);
+ putbits(fd_ou, 1);
+ putbits(fd_ex, 2);
+#undef putbits
+ }
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
+
+ return (error);
+}
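+
+/*
+ * Editor's note: a worked sizing example, not part of the original change,
+ * for the buffer layout set up by getbits() above.  Numbers assume LP64
+ * (NFDBITS == 64) and a native caller (abi_nfdbits == NFDBITS).
+ *
+ * For nd = 100 with all three sets non-NULL:
+ *	nfdbits   = roundup(100, 64)        = 128
+ *	ncpbytes  = 128 / NBBY              = 16 bytes per set (kernel copy)
+ *	ncpubytes = roundup(100, 64) / NBBY = 16 bytes per set (user copy)
+ *	nbufbytes = 3 * 2 * 16              = 96 bytes
+ * which fits easily in s_selbits (howmany(2048, NFDBITS) fd_masks == 256
+ * bytes), so no malloc() is needed.  getbits() points obits[x] into the
+ * first half of the buffer and ibits[x] into the second half at the same
+ * offset, which is why a single bzero(selbits, nbufbytes / 2) clears all
+ * of the output sets at once.
+ */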
+/*
+ * Convert a select bit set to poll flags.
+ *
+ * The backend always returns POLLHUP/POLLERR if appropriate and we
+ * return this as a set bit in any set.
+ */
+static int select_flags[3] = {
+ POLLRDNORM | POLLHUP | POLLERR,
+ POLLWRNORM | POLLHUP | POLLERR,
+ POLLRDBAND | POLLERR
+};
+
+/*
+ * Compute the fo_poll flags required for a fd given by the index and
+ * bit position in the fd_mask array.
+ */
+static __inline int
+selflags(fd_mask **ibits, int idx, fd_mask bit)
+{
+ int flags;
+ int msk;
+
+ flags = 0;
+ for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
+ if ((ibits[msk][idx] & bit) == 0)
+ continue;
+ flags |= select_flags[msk];
+ }
+ return (flags);
+}
+
+/*
+ * Set the appropriate output bits given a mask of fired events and the
+ * input bits originally requested.
+ */
+static __inline int
+selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
+{
+ int msk;
+ int n;
+
+ n = 0;
+ for (msk = 0; msk < 3; msk++) {
+ if ((events & select_flags[msk]) == 0)
+ continue;
+ if (ibits[msk] == NULL)
+ continue;
+ if ((ibits[msk][idx] & bit) == 0)
+ continue;
+ /*
+ * XXX Check for a duplicate set. This can occur because a
+ * socket calls selrecord() twice for each poll() call
+ * resulting in two selfds per real fd. selrescan() will
+ * call selsetbits twice as a result.
+ */
+ if ((obits[msk][idx] & bit) != 0)
+ continue;
+ obits[msk][idx] |= bit;
+ n++;
+ }
+
+ return (n);
+}
+
+static __inline int
+getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
+{
+ cap_rights_t rights;
+
+ return (fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_POLL_EVENT),
+ 0, fpp, NULL));
+}
+
+/*
+ * Traverse the list of fds attached to this thread's seltd and check for
+ * completion.
+ */
+static int
+selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
+{
+ struct filedesc *fdp;
+ struct selinfo *si;
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct file *fp;
+ fd_mask bit;
+ int fd, ev, n, idx;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ stp = td->td_sel;
+ n = 0;
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
+ fd = (int)(uintptr_t)sfp->sf_cookie;
+ si = sfp->sf_si;
+ selfdfree(stp, sfp);
+ /* If the selinfo wasn't cleared the event didn't fire. */
+ if (si != NULL)
+ continue;
+ error = getselfd_cap(fdp, fd, &fp);
+ if (error)
+ return (error);
+ idx = fd / NFDBITS;
+ bit = (fd_mask)1 << (fd % NFDBITS);
+ ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
+ fdrop(fp, td);
+ if (ev != 0)
+ n += selsetbits(ibits, obits, idx, bit, ev);
+ }
+ stp->st_flags = 0;
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * Perform the initial file descriptor scan and register ourselves with
+ * each selinfo.
+ */
+static int
+selscan(td, ibits, obits, nfd)
+ struct thread *td;
+ fd_mask **ibits, **obits;
+ int nfd;
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ fd_mask bit;
+ int ev, flags, end, fd;
+ int n, idx;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ n = 0;
+ for (idx = 0, fd = 0; fd < nfd; idx++) {
+ end = imin(fd + NFDBITS, nfd);
+ for (bit = 1; fd < end; bit <<= 1, fd++) {
+ /* Compute the list of events we're interested in. */
+ flags = selflags(ibits, idx, bit);
+ if (flags == 0)
+ continue;
+ error = getselfd_cap(fdp, fd, &fp);
+ if (error)
+ return (error);
+ selfdalloc(td, (void *)(uintptr_t)fd);
+ ev = fo_poll(fp, flags, td->td_ucred, td);
+ fdrop(fp, td);
+ if (ev != 0)
+ n += selsetbits(ibits, obits, idx, bit, ev);
+ }
+ }
+
+ td->td_retval[0] = n;
+ return (0);
+}
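+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of the descriptor-to-bit mapping that selscan(), selrescan() and
+ * selsetbits() all use.  The helper name is hypothetical.
+ */
+#if 0
+static __inline void
+fd_to_maskpos(int fd, int *idxp, fd_mask *bitp)
+{
+	/* For fd = 70 with NFDBITS == 64: idx = 1, bit = (fd_mask)1 << 6. */
+	*idxp = fd / NFDBITS;
+	*bitp = (fd_mask)1 << (fd % NFDBITS);
+}
+#endif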
+
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+sys_poll(td, uap)
+ struct thread *td;
+ struct poll_args *uap;
+{
+ struct pollfd *bits;
+ struct pollfd smallbits[32];
+ sbintime_t asbt, precision, rsbt;
+ u_int nfds;
+ int error;
+ size_t ni;
+
+ nfds = uap->nfds;
+ if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
+ return (EINVAL);
+ ni = nfds * sizeof(struct pollfd);
+ if (ni > sizeof(smallbits))
+ bits = malloc(ni, M_TEMP, M_WAITOK);
+ else
+ bits = smallbits;
+ error = copyin(uap->fds, bits, ni);
+ if (error)
+ goto done;
+ precision = 0;
+ if (uap->timeout != INFTIM) {
+ if (uap->timeout < 0) {
+ error = EINVAL;
+ goto done;
+ }
+ if (uap->timeout == 0)
+ asbt = 0;
+ else {
+ rsbt = SBT_1MS * uap->timeout;
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ asbt += rsbt;
+ }
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /* Iterate until the timeout expires or descriptors become ready. */
+ for (;;) {
+ error = pollscan(td, bits, nfds);
+ if (error || td->td_retval[0] != 0)
+ break;
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ error = pollrescan(td);
+ if (error || td->td_retval[0] != 0)
+ break;
+ }
+ seltdclear(td);
+
+done:
+ /* poll is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ if (error == 0) {
+ error = pollout(td, bits, uap->fds, nfds);
+ if (error)
+ goto out;
+ }
+out:
+ if (ni > sizeof(smallbits))
+ free(bits, M_TEMP);
+ return (error);
+}
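+
+/*
+ * Editor's note: a worked example, not part of the original change, of the
+ * timeout handling above, assuming a caller passed timeout = 250 (ms):
+ *
+ *	rsbt      = SBT_1MS * 250		relative interval
+ *	precision = rsbt >> tc_precexp		allowed callout slop
+ *	asbt      = "now" + rsbt		absolute deadline
+ *
+ * TIMESEL() fills in "now" (plus one tc_tick_sbt of slop if the cheap
+ * timecounter read was used) and seltdwait() then sleeps with C_ABSOLUTE
+ * until the deadline, an event, or a signal.  timeout == 0 polls once
+ * (EWOULDBLOCK is mapped to success in done:), and INFTIM gives asbt = -1,
+ * i.e. no deadline at all.
+ */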
+
+static int
+pollrescan(struct thread *td)
+{
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct selinfo *si;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct pollfd *fd;
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int n;
+
+ n = 0;
+ fdp = td->td_proc->p_fd;
+ stp = td->td_sel;
+ FILEDESC_SLOCK(fdp);
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
+ fd = (struct pollfd *)sfp->sf_cookie;
+ si = sfp->sf_si;
+ selfdfree(stp, sfp);
+ /* If the selinfo wasn't cleared the event didn't fire. */
+ if (si != NULL)
+ continue;
+ fp = fdp->fd_ofiles[fd->fd].fde_file;
+#ifdef CAPABILITIES
+ if (fp == NULL ||
+ cap_check(cap_rights(fdp, fd->fd),
+ cap_rights_init(&rights, CAP_POLL_EVENT)) != 0)
+#else
+ if (fp == NULL)
+#endif
+ {
+ fd->revents = POLLNVAL;
+ n++;
+ continue;
+ }
+
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
+ if (fd->revents != 0)
+ n++;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ stp->st_flags = 0;
+ td->td_retval[0] = n;
+ return (0);
+}
+
+
+static int
+pollout(td, fds, ufds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ struct pollfd *ufds;
+ u_int nfd;
+{
+ int error = 0;
+ u_int i = 0;
+ u_int n = 0;
+
+ for (i = 0; i < nfd; i++) {
+ error = copyout(&fds->revents, &ufds->revents,
+ sizeof(ufds->revents));
+ if (error)
+ return (error);
+ if (fds->revents != 0)
+ n++;
+ fds++;
+ ufds++;
+ }
+ td->td_retval[0] = n;
+ return (0);
+}
+
+static int
+pollscan(td, fds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ u_int nfd;
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *fp;
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int i, n = 0;
+
+ FILEDESC_SLOCK(fdp);
+ for (i = 0; i < nfd; i++, fds++) {
+ if (fds->fd >= fdp->fd_nfiles) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else if (fds->fd < 0) {
+ fds->revents = 0;
+ } else {
+ fp = fdp->fd_ofiles[fds->fd].fde_file;
+#ifdef CAPABILITIES
+ if (fp == NULL ||
+ cap_check(cap_rights(fdp, fds->fd),
+ cap_rights_init(&rights, CAP_POLL_EVENT)) != 0)
+#else
+ if (fp == NULL)
+#endif
+ {
+ fds->revents = POLLNVAL;
+ n++;
+ } else {
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ selfdalloc(td, fds);
+ fds->revents = fo_poll(fp, fds->events,
+ td->td_ucred, td);
+ /*
+ * POSIX requires that POLLOUT never be
+ * set simultaneously with POLLHUP.
+ */
+ if ((fds->revents & POLLHUP) != 0)
+ fds->revents &= ~POLLOUT;
+
+ if (fds->revents != 0)
+ n++;
+ }
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ *
+ * XXX this isn't quite a true representation.  OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+sys_openbsd_poll(td, uap)
+ register struct thread *td;
+ register struct openbsd_poll_args *uap;
+{
+ return (sys_poll(td, (struct poll_args *)uap));
+}
+
+/*
+ * XXX This was created specifically to support netncp and netsmb. This
+ * allows the caller to specify a socket to wait for events on. It returns
+ * 0 if any events matched and an error otherwise. There is no way to
+ * determine which events fired.
+ */
+int
+selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
+{
+ struct timeval rtv;
+ sbintime_t asbt, precision, rsbt;
+ int error;
+
+ precision = 0; /* stupid gcc! */
+ if (tvp != NULL) {
+ rtv = *tvp;
+ if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
+ rtv.tv_usec >= 1000000)
+ return (EINVAL);
+ if (!timevalisset(&rtv))
+ asbt = 0;
+ else if (rtv.tv_sec <= INT32_MAX) {
+ rsbt = tvtosbt(rtv);
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = -1;
+ } else
+ asbt = -1;
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /*
+ * Iterate until the timeout expires or the socket becomes ready.
+ */
+ for (;;) {
+ selfdalloc(td, NULL);
+ error = sopoll(so, events, NULL, td);
+ /* error here is actually the ready events. */
+ if (error)
+ return (0);
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ }
+ seltdclear(td);
+ /* XXX Duplicates ncp/smb behavior. */
+ if (error == ERESTART)
+ error = 0;
+ return (error);
+}
+
+/*
+ * Preallocate two selfds associated with 'cookie'. Some fo_poll routines
+ * have two select sets, one for read and another for write.
+ */
+static void
+selfdalloc(struct thread *td, void *cookie)
+{
+ struct seltd *stp;
+
+ stp = td->td_sel;
+ if (stp->st_free1 == NULL)
+ stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+ stp->st_free1->sf_td = stp;
+ stp->st_free1->sf_cookie = cookie;
+ if (stp->st_free2 == NULL)
+ stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+ stp->st_free2->sf_td = stp;
+ stp->st_free2->sf_cookie = cookie;
+}
+
+static void
+selfdfree(struct seltd *stp, struct selfd *sfp)
+{
+ STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
+ mtx_lock(sfp->sf_mtx);
+ if (sfp->sf_si)
+ TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
+ mtx_unlock(sfp->sf_mtx);
+ uma_zfree(selfd_zone, sfp);
+}
+
+/* Drain the waiters tied to all the selfds belonging to the specified selinfo. */
+void
+seldrain(sip)
+ struct selinfo *sip;
+{
+
+ /*
+ * This functionality is already provided by doselwakeup(), so it is
+ * enough to call it here.
+ * Eventually the caller's context should take care to avoid races
+ * between a thread calling select()/poll() and file descriptor
+ * detaching, but, again, those races are just the same as for
+ * selwakeup().
+ */
+ doselwakeup(sip, -1);
+}
+
+/*
+ * Record a select request.
+ */
+void
+selrecord(selector, sip)
+ struct thread *selector;
+ struct selinfo *sip;
+{
+ struct selfd *sfp;
+ struct seltd *stp;
+ struct mtx *mtxp;
+
+ stp = selector->td_sel;
+ /*
+ * Don't record when doing a rescan.
+ */
+ if (stp->st_flags & SELTD_RESCAN)
+ return;
+ /*
+ * Grab one of the preallocated descriptors.
+ */
+ sfp = NULL;
+ if ((sfp = stp->st_free1) != NULL)
+ stp->st_free1 = NULL;
+ else if ((sfp = stp->st_free2) != NULL)
+ stp->st_free2 = NULL;
+ else
+ panic("selrecord: No free selfd on selq");
+ mtxp = sip->si_mtx;
+ if (mtxp == NULL)
+ mtxp = mtx_pool_find(mtxpool_select, sip);
+ /*
+ * Initialize the sfp and queue it in the thread.
+ */
+ sfp->sf_si = sip;
+ sfp->sf_mtx = mtxp;
+ STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
+ /*
+ * Now that we've locked the sip, check for initialization.
+ */
+ mtx_lock(mtxp);
+ if (sip->si_mtx == NULL) {
+ sip->si_mtx = mtxp;
+ TAILQ_INIT(&sip->si_tdlist);
+ }
+ /*
+ * Add this thread to the list of selfds listening on this selinfo.
+ */
+ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
+ mtx_unlock(sip->si_mtx);
+}
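+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of how a device driver's poll method typically calls selrecord() above.
+ * The softc, its fields and the function names are hypothetical; only the
+ * pattern (test readiness under the driver lock, otherwise selrecord())
+ * is the point.
+ */
+#if 0
+struct mydev_softc {
+	struct mtx	sc_mtx;
+	struct selinfo	sc_rsel;	/* threads waiting for readability */
+	int		sc_nread;	/* bytes currently available */
+};
+
+static int
+mydev_poll(struct cdev *dev, int events, struct thread *td)
+{
+	struct mydev_softc *sc = dev->si_drv1;
+	int revents = 0;
+
+	mtx_lock(&sc->sc_mtx);
+	if (events & (POLLIN | POLLRDNORM)) {
+		if (sc->sc_nread > 0)
+			revents |= events & (POLLIN | POLLRDNORM);
+		else
+			/* Queue a selfd for this thread on sc_rsel. */
+			selrecord(td, &sc->sc_rsel);
+	}
+	mtx_unlock(&sc->sc_mtx);
+	return (revents);
+}
+#endif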
+
+/* Wake up a selecting thread. */
+void
+selwakeup(sip)
+ struct selinfo *sip;
+{
+ doselwakeup(sip, -1);
+}
+
+/* Wake up a selecting thread, and set its priority. */
+void
+selwakeuppri(sip, pri)
+ struct selinfo *sip;
+ int pri;
+{
+ doselwakeup(sip, pri);
+}
+
+/*
+ * Do a wakeup when a selectable event occurs.
+ */
+static void
+doselwakeup(sip, pri)
+ struct selinfo *sip;
+ int pri;
+{
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct seltd *stp;
+
+ /* If it's not initialized there can't be any waiters. */
+ if (sip->si_mtx == NULL)
+ return;
+ /*
+ * Locking the selinfo locks all selfds associated with it.
+ */
+ mtx_lock(sip->si_mtx);
+ TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
+ /*
+ * Once we remove this sfp from the list and clear its
+ * sf_si, seltdclear() will know to ignore this si.
+ */
+ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
+ sfp->sf_si = NULL;
+ stp = sfp->sf_td;
+ mtx_lock(&stp->st_mtx);
+ stp->st_flags |= SELTD_PENDING;
+ cv_broadcastpri(&stp->st_wait, pri);
+ mtx_unlock(&stp->st_mtx);
+ }
+ mtx_unlock(sip->si_mtx);
+}
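+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of the producer side that pairs with the hypothetical mydev_poll() sketch
+ * after selrecord() above.  When data arrives the driver wakes the recorded
+ * waiters, which ends up in doselwakeup().
+ */
+#if 0
+static void
+mydev_data_arrived(struct mydev_softc *sc, int nbytes)
+{
+	mtx_lock(&sc->sc_mtx);
+	sc->sc_nread += nbytes;
+	/* Marks the waiters' seltd SELTD_PENDING and broadcasts st_wait. */
+	selwakeup(&sc->sc_rsel);
+	mtx_unlock(&sc->sc_mtx);
+}
+#endif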
+
+static void
+seltdinit(struct thread *td)
+{
+ struct seltd *stp;
+
+ if ((stp = td->td_sel) != NULL)
+ goto out;
+ td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
+ mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
+ cv_init(&stp->st_wait, "select");
+out:
+ stp->st_flags = 0;
+ STAILQ_INIT(&stp->st_selq);
+}
+
+static int
+seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
+{
+ struct seltd *stp;
+ int error;
+
+ stp = td->td_sel;
+ /*
+ * An event of interest may occur while we do not hold the seltd
+ * locked so check the pending flag before we sleep.
+ */
+ mtx_lock(&stp->st_mtx);
+ /*
+ * Any further calls to selrecord will be a rescan.
+ */
+ stp->st_flags |= SELTD_RESCAN;
+ if (stp->st_flags & SELTD_PENDING) {
+ mtx_unlock(&stp->st_mtx);
+ return (0);
+ }
+ if (sbt == 0)
+ error = EWOULDBLOCK;
+ else if (sbt != -1)
+ error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
+ sbt, precision, C_ABSOLUTE);
+ else
+ error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
+ mtx_unlock(&stp->st_mtx);
+
+ return (error);
+}
+
+void
+seltdfini(struct thread *td)
+{
+ struct seltd *stp;
+
+ stp = td->td_sel;
+ if (stp == NULL)
+ return;
+ if (stp->st_free1)
+ uma_zfree(selfd_zone, stp->st_free1);
+ if (stp->st_free2)
+ uma_zfree(selfd_zone, stp->st_free2);
+ td->td_sel = NULL;
+ free(stp, M_SELECT);
+}
+
+/*
+ * Remove the references to the thread from all of the objects we were
+ * polling.
+ */
+static void
+seltdclear(struct thread *td)
+{
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+
+ stp = td->td_sel;
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
+ selfdfree(stp, sfp);
+ stp->st_flags = 0;
+}
+
+static void selectinit(void *);
+SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
+static void
+selectinit(void *dummy __unused)
+{
+
+ selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
+}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
new file mode 100644
index 0000000..76c295e
--- /dev/null
+++ b/sys/kern/sys_pipe.c
@@ -0,0 +1,1834 @@
+/*-
+ * Copyright (c) 1996 John S. Dyson
+ * Copyright (c) 2012 Giovanni Trematerra
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ */
+
+/*
+ * This file contains a high-performance replacement for the socket-based
+ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
+ * all features of sockets, but does do everything that pipes normally
+ * do.
+ */
+
+/*
+ * This code has two modes of operation, a small write mode and a large
+ * write mode. The small write mode acts like conventional pipes with
+ * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
+ * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
+ * and PIPE_SIZE in size, the sending process pins the underlying pages in
+ * memory, and the receiving process copies directly from these pinned pages
+ * in the sending process.
+ *
+ * If the sending process receives a signal, it is possible that it will
+ * go away, and certainly its address space can change, because control
+ * is returned back to the user-mode side. In that case, the pipe code
+ * arranges to copy the buffer supplied by the user process, to a pageable
+ * kernel buffer, and the receiving process will grab the data from the
+ * pageable kernel buffer. Since signals don't happen all that often,
+ * the copy operation is normally eliminated.
+ *
+ * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
+ * happen for small transfers so that the system will not spend all of
+ * its time context switching.
+ *
+ * In order to limit the resource use of pipes, two sysctls exist:
+ *
+ * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
+ * address space available to us in pipe_map. This value is normally
+ * autotuned, but may also be loader tuned.
+ *
+ * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
+ * memory in use by pipes.
+ *
+ * Based on how large pipekva is relative to maxpipekva, the following
+ * will happen:
+ *
+ * 0% - 50%:
+ * New pipes are given 16K of memory backing, pipes may dynamically
+ * grow to as large as 64K where needed.
+ * 50% - 75%:
+ * New pipes are given 4K (or PAGE_SIZE) of memory backing,
+ * existing pipes may NOT grow.
+ * 75% - 100%:
+ * New pipes are given 4K (or PAGE_SIZE) of memory backing,
+ * existing pipes will be shrunk down to 4K whenever possible.
+ *
+ * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If
+ * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
+ * resize which MUST occur for reverse-direction pipes when they are
+ * first used.
+ *
+ * Additional information about the current state of pipes may be obtained
+ * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
+ * and kern.ipc.piperesizefail.
+ *
+ * Locking rules: There are two locks present here: A mutex, used via
+ * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via
+ * the flag, as mutexes cannot be held across uiomove.  The mutex
+ * exists only to guard access to the flag, and is not in itself a
+ * locking mechanism. Also note that there is only a single mutex for
+ * both directions of a pipe.
+ *
+ * As pipelock() may have to sleep before it can acquire the flag, it
+ * is important to reread all data after a call to pipelock(); everything
+ * in the structure may have changed.
+ */
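+
+/*
+ * Editor's note: a minimal sketch, not part of the original change, of the
+ * locking pattern described above and followed by pipe_read()/pipe_write()
+ * below: take the pair mutex, take the long-term flag lock, and drop the
+ * mutex around anything that may sleep in uiomove().  The function name is
+ * hypothetical and the uiomove() step is elided.
+ */
+#if 0
+static int
+pipe_io_pattern(struct pipe *cpipe, struct uio *uio)
+{
+	int error;
+
+	PIPE_LOCK(cpipe);
+	error = pipelock(cpipe, 1);	/* may sleep; catches signals */
+	if (error == 0) {
+		PIPE_UNLOCK(cpipe);
+		/* ... uiomove() to or from pipe_buffer would go here ... */
+		PIPE_LOCK(cpipe);
+		/* Reread pipe state here: it may have changed while unlocked. */
+		pipeunlock(cpipe);
+	}
+	PIPE_UNLOCK(cpipe);
+	return (error);
+}
+#endif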
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ttycom.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/event.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * Use this define if you want to disable *fancy* VM things. Expect an
+ * approx 30% decrease in transfer rate. This could be useful for
+ * NetBSD or OpenBSD.
+ */
+/* #define PIPE_NODIRECT */
+
+#define PIPE_PEER(pipe) \
+ (((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))
+
+/*
+ * interfaces to the outside world
+ */
+static fo_rdwr_t pipe_read;
+static fo_rdwr_t pipe_write;
+static fo_truncate_t pipe_truncate;
+static fo_ioctl_t pipe_ioctl;
+static fo_poll_t pipe_poll;
+static fo_kqfilter_t pipe_kqfilter;
+static fo_stat_t pipe_stat;
+static fo_close_t pipe_close;
+static fo_chmod_t pipe_chmod;
+static fo_chown_t pipe_chown;
+
+struct fileops pipeops = {
+ .fo_read = pipe_read,
+ .fo_write = pipe_write,
+ .fo_truncate = pipe_truncate,
+ .fo_ioctl = pipe_ioctl,
+ .fo_poll = pipe_poll,
+ .fo_kqfilter = pipe_kqfilter,
+ .fo_stat = pipe_stat,
+ .fo_close = pipe_close,
+ .fo_chmod = pipe_chmod,
+ .fo_chown = pipe_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+static void filt_pipedetach(struct knote *kn);
+static void filt_pipedetach_notsup(struct knote *kn);
+static int filt_pipenotsup(struct knote *kn, long hint);
+static int filt_piperead(struct knote *kn, long hint);
+static int filt_pipewrite(struct knote *kn, long hint);
+
+static struct filterops pipe_nfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach_notsup,
+ .f_event = filt_pipenotsup
+};
+static struct filterops pipe_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_piperead
+};
+static struct filterops pipe_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_pipewrite
+};
+
+/*
+ * Default pipe buffer size(s); these can be fairly large now because pipe
+ * space is pageable. The pipe code will try to maintain locality of
+ * reference for performance reasons, so small amounts of outstanding I/O
+ * will not wipe the cache.
+ */
+#define MINPIPESIZE (PIPE_SIZE/3)
+#define MAXPIPESIZE (2*PIPE_SIZE/3)
+
+static long amountpipekva;
+static int pipefragretry;
+static int pipeallocfail;
+static int piperesizefail;
+static int piperesizeallowed = 1;
+
+SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
+ &maxpipekva, 0, "Pipe KVA limit");
+SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
+ &amountpipekva, 0, "Pipe KVA usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
+ &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
+ &pipeallocfail, 0, "Pipe allocation failures");
+SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
+ &piperesizefail, 0, "Pipe resize failures");
+SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
+ &piperesizeallowed, 0, "Pipe resizing allowed");
+
+static void pipeinit(void *dummy __unused);
+static void pipeclose(struct pipe *cpipe);
+static void pipe_free_kmem(struct pipe *cpipe);
+static int pipe_create(struct pipe *pipe, int backing);
+static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
+static __inline int pipelock(struct pipe *cpipe, int catch);
+static __inline void pipeunlock(struct pipe *cpipe);
+#ifndef PIPE_NODIRECT
+static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
+static void pipe_destroy_write_buffer(struct pipe *wpipe);
+static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
+static void pipe_clone_write_buffer(struct pipe *wpipe);
+#endif
+static int pipespace(struct pipe *cpipe, int size);
+static int pipespace_new(struct pipe *cpipe, int size);
+
+static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
+static int pipe_zone_init(void *mem, int size, int flags);
+static void pipe_zone_fini(void *mem, int size);
+
+static uma_zone_t pipe_zone;
+static struct unrhdr *pipeino_unr;
+static dev_t pipedev_ino;
+
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
+
+static void
+pipeinit(void *dummy __unused)
+{
+
+ pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
+ pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
+ UMA_ALIGN_PTR, 0);
+ KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
+ pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
+ KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
+ pipedev_ino = devfs_alloc_cdp_inode();
+ KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
+}
+
+static int
+pipe_zone_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct pipepair *pp;
+ struct pipe *rpipe, *wpipe;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ /*
+ * We zero both pipe endpoints to make sure all the kmem pointers
+ * are NULL, flag fields are zero'd, etc. We timestamp both
+ * endpoints with the same time.
+ */
+ rpipe = &pp->pp_rpipe;
+ bzero(rpipe, sizeof(*rpipe));
+ vfs_timestamp(&rpipe->pipe_ctime);
+ rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
+
+ wpipe = &pp->pp_wpipe;
+ bzero(wpipe, sizeof(*wpipe));
+ wpipe->pipe_ctime = rpipe->pipe_ctime;
+ wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
+
+ rpipe->pipe_peer = wpipe;
+ rpipe->pipe_pair = pp;
+ wpipe->pipe_peer = rpipe;
+ wpipe->pipe_pair = pp;
+
+ /*
+ * Mark both endpoints as present; they will later get free'd
+ * one at a time. When both are free'd, then the whole pair
+ * is released.
+ */
+ rpipe->pipe_present = PIPE_ACTIVE;
+ wpipe->pipe_present = PIPE_ACTIVE;
+
+ /*
+ * Eventually, the MAC Framework may initialize the label
+ * in ctor or init, but for now we do it elsewhere to avoid
+ * blocking in ctor or init.
+ */
+ pp->pp_label = NULL;
+
+ return (0);
+}
+
+static int
+pipe_zone_init(void *mem, int size, int flags)
+{
+ struct pipepair *pp;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
+ return (0);
+}
+
+static void
+pipe_zone_fini(void *mem, int size)
+{
+ struct pipepair *pp;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ mtx_destroy(&pp->pp_mtx);
+}
+
+static int
+pipe_paircreate(struct thread *td, struct pipepair **p_pp)
+{
+ struct pipepair *pp;
+ struct pipe *rpipe, *wpipe;
+ int error;
+
+ *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
+#ifdef MAC
+ /*
+ * The MAC label is shared between the connected endpoints. As a
+ * result mac_pipe_init() and mac_pipe_create() are called once
+ * for the pair, and not on the endpoints.
+ */
+ mac_pipe_init(pp);
+ mac_pipe_create(td->td_ucred, pp);
+#endif
+ rpipe = &pp->pp_rpipe;
+ wpipe = &pp->pp_wpipe;
+
+ knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
+ knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
+
+ /* Only the forward direction pipe is backed by default */
+ if ((error = pipe_create(rpipe, 1)) != 0 ||
+ (error = pipe_create(wpipe, 0)) != 0) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ return (error);
+ }
+
+ rpipe->pipe_state |= PIPE_DIRECTOK;
+ wpipe->pipe_state |= PIPE_DIRECTOK;
+ return (0);
+}
+
+int
+pipe_named_ctor(struct pipe **ppipe, struct thread *td)
+{
+ struct pipepair *pp;
+ int error;
+
+ error = pipe_paircreate(td, &pp);
+ if (error != 0)
+ return (error);
+ pp->pp_rpipe.pipe_state |= PIPE_NAMED;
+ *ppipe = &pp->pp_rpipe;
+ return (0);
+}
+
+void
+pipe_dtor(struct pipe *dpipe)
+{
+ ino_t ino;
+
+ ino = dpipe->pipe_ino;
+ funsetown(&dpipe->pipe_sigio);
+ pipeclose(dpipe);
+ if (dpipe->pipe_state & PIPE_NAMED) {
+ dpipe = dpipe->pipe_peer;
+ funsetown(&dpipe->pipe_sigio);
+ pipeclose(dpipe);
+ }
+ if (ino != 0 && ino != (ino_t)-1)
+ free_unr(pipeino_unr, ino);
+}
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
+ * the zone pick up the pieces via pipeclose().
+ */
+int
+kern_pipe(struct thread *td, int fildes[2])
+{
+
+ return (kern_pipe2(td, fildes, 0));
+}
+
+int
+kern_pipe2(struct thread *td, int fildes[2], int flags)
+{
+ struct filedesc *fdp;
+ struct file *rf, *wf;
+ struct pipe *rpipe, *wpipe;
+ struct pipepair *pp;
+ int fd, fflags, error;
+
+ fdp = td->td_proc->p_fd;
+ error = pipe_paircreate(td, &pp);
+ if (error != 0)
+ return (error);
+ rpipe = &pp->pp_rpipe;
+ wpipe = &pp->pp_wpipe;
+ error = falloc(td, &rf, &fd, flags);
+ if (error) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ return (error);
+ }
+ /* An extra reference on `rf' has been held for us by falloc(). */
+ fildes[0] = fd;
+
+ fflags = FREAD | FWRITE;
+ if ((flags & O_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+
+ /*
+ * Warning: once we've gotten past allocation of the fd for the
+ * read-side, we can only drop the read side via fdrop() in order
+ * to avoid races against processes which manage to dup() the read
+ * side while we are blocked trying to allocate the write side.
+ */
+ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
+ error = falloc(td, &wf, &fd, flags);
+ if (error) {
+ fdclose(fdp, rf, fildes[0], td);
+ fdrop(rf, td);
+ /* rpipe has been closed by fdrop(). */
+ pipeclose(wpipe);
+ return (error);
+ }
+ /* An extra reference on `wf' has been held for us by falloc(). */
+ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
+ fdrop(wf, td);
+ fildes[1] = fd;
+ fdrop(rf, td);
+
+ return (0);
+}
+
+/* ARGSUSED */
+int
+sys_pipe(struct thread *td, struct pipe_args *uap)
+{
+ int error;
+ int fildes[2];
+
+ error = kern_pipe(td, fildes);
+ if (error)
+ return (error);
+
+ td->td_retval[0] = fildes[0];
+ td->td_retval[1] = fildes[1];
+
+ return (0);
+}
+
+int
+sys_pipe2(struct thread *td, struct pipe2_args *uap)
+{
+ int error, fildes[2];
+
+ if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
+ return (EINVAL);
+ error = kern_pipe2(td, fildes, uap->flags);
+ if (error)
+ return (error);
+ error = copyout(fildes, uap->fildes, 2 * sizeof(int));
+ if (error) {
+ (void)kern_close(td, fildes[0]);
+ (void)kern_close(td, fildes[1]);
+ }
+ return (error);
+}
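+
+/*
+ * Editor's note: an illustrative userland sketch, not part of the original
+ * change, showing the flags sys_pipe2() accepts.  The function name is
+ * hypothetical.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+make_cloexec_pipe(int fds[2])
+{
+	/*
+	 * Only O_CLOEXEC and O_NONBLOCK are allowed; any other flag makes
+	 * sys_pipe2() return EINVAL before the pipe is created.
+	 */
+	return (pipe2(fds, O_CLOEXEC | O_NONBLOCK));
+}
+#endif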
+
+/*
+ * Allocate kva for the pipe circular buffer; the space is pageable.
+ * This routine will 'realloc' the size of a pipe safely: if it fails,
+ * it will retain the old buffer and return ENOMEM.
+ */
+static int
+pipespace_new(cpipe, size)
+ struct pipe *cpipe;
+ int size;
+{
+ caddr_t buffer;
+ int error, cnt, firstseg;
+ static int curfail = 0;
+ static struct timeval lastfail;
+
+ KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
+ KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
+ ("pipespace: resize of direct writes not allowed"));
+retry:
+ cnt = cpipe->pipe_buffer.cnt;
+ if (cnt > size)
+ size = cnt;
+
+ size = round_page(size);
+ buffer = (caddr_t) vm_map_min(pipe_map);
+
+ error = vm_map_find(pipe_map, NULL, 0,
+ (vm_offset_t *) &buffer, size, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ if ((cpipe->pipe_buffer.buffer == NULL) &&
+ (size > SMALL_PIPE_SIZE)) {
+ size = SMALL_PIPE_SIZE;
+ pipefragretry++;
+ goto retry;
+ }
+ if (cpipe->pipe_buffer.buffer == NULL) {
+ pipeallocfail++;
+ if (ppsratecheck(&lastfail, &curfail, 1))
+ printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
+ } else {
+ piperesizefail++;
+ }
+ return (ENOMEM);
+ }
+
+ /* copy data, then free old resources if we're resizing */
+ if (cnt > 0) {
+ if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
+ firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
+ bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
+ buffer, firstseg);
+ if ((cnt - firstseg) > 0)
+ bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
+ cpipe->pipe_buffer.in);
+ } else {
+ bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
+ buffer, cnt);
+ }
+ }
+ pipe_free_kmem(cpipe);
+ cpipe->pipe_buffer.buffer = buffer;
+ cpipe->pipe_buffer.size = size;
+ cpipe->pipe_buffer.in = cnt;
+ cpipe->pipe_buffer.out = 0;
+ cpipe->pipe_buffer.cnt = cnt;
+ atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
+ return (0);
+}
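+
+/*
+ * Editor's note: a worked example, not part of the original change, of the
+ * wrapped-data copy above.  Suppose the old buffer has size 16384 with
+ * out = 12288, in = 2048 and cnt = 6144, i.e. the data wraps past the end:
+ *
+ *	firstseg = 16384 - 12288 = 4096 bytes copied from old[12288..16384)
+ *	in       = 2048          bytes copied from old[0..2048)
+ *
+ * land back to back at the start of the new buffer, after which the pipe
+ * is left with in = cnt = 6144 and out = 0, so the data has been
+ * linearized as a side effect of the resize.
+ */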
+
+/*
+ * Wrapper for pipespace_new() that performs locking assertions.
+ */
+static int
+pipespace(cpipe, size)
+ struct pipe *cpipe;
+ int size;
+{
+
+ KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
+ ("Unlocked pipe passed to pipespace"));
+ return (pipespace_new(cpipe, size));
+}
+
+/*
+ * lock a pipe for I/O, blocking other access
+ */
+static __inline int
+pipelock(cpipe, catch)
+ struct pipe *cpipe;
+ int catch;
+{
+ int error;
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ while (cpipe->pipe_state & PIPE_LOCKFL) {
+ cpipe->pipe_state |= PIPE_LWANT;
+ error = msleep(cpipe, PIPE_MTX(cpipe),
+ catch ? (PRIBIO | PCATCH) : PRIBIO,
+ "pipelk", 0);
+ if (error != 0)
+ return (error);
+ }
+ cpipe->pipe_state |= PIPE_LOCKFL;
+ return (0);
+}
+
+/*
+ * unlock a pipe I/O lock
+ */
+static __inline void
+pipeunlock(cpipe)
+ struct pipe *cpipe;
+{
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
+ ("Unlocked pipe passed to pipeunlock"));
+ cpipe->pipe_state &= ~PIPE_LOCKFL;
+ if (cpipe->pipe_state & PIPE_LWANT) {
+ cpipe->pipe_state &= ~PIPE_LWANT;
+ wakeup(cpipe);
+ }
+}
+
+void
+pipeselwakeup(cpipe)
+ struct pipe *cpipe;
+{
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ if (cpipe->pipe_state & PIPE_SEL) {
+ selwakeuppri(&cpipe->pipe_sel, PSOCK);
+ if (!SEL_WAITING(&cpipe->pipe_sel))
+ cpipe->pipe_state &= ~PIPE_SEL;
+ }
+ if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
+ pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
+ KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
+}
+
+/*
+ * Initialize and allocate VM and memory for pipe. The structure
+ * will start out zero'd from the ctor, so we just manage the kmem.
+ */
+static int
+pipe_create(pipe, backing)
+ struct pipe *pipe;
+ int backing;
+{
+ int error;
+
+ if (backing) {
+ if (amountpipekva > maxpipekva / 2)
+ error = pipespace_new(pipe, SMALL_PIPE_SIZE);
+ else
+ error = pipespace_new(pipe, PIPE_SIZE);
+ } else {
+ /* If we're not backing this pipe, no need to do anything. */
+ error = 0;
+ }
+ pipe->pipe_ino = -1;
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+pipe_read(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ struct thread *td;
+ int flags;
+{
+ struct pipe *rpipe;
+ int error;
+ int nread = 0;
+ int size;
+
+ rpipe = fp->f_data;
+ PIPE_LOCK(rpipe);
+ ++rpipe->pipe_busy;
+ error = pipelock(rpipe, 1);
+ if (error)
+ goto unlocked_error;
+
+#ifdef MAC
+ error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
+ if (error)
+ goto locked_error;
+#endif
+ if (amountpipekva > (3 * maxpipekva) / 4) {
+ if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
+ (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
+ (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
+ (piperesizeallowed == 1)) {
+ PIPE_UNLOCK(rpipe);
+ pipespace(rpipe, SMALL_PIPE_SIZE);
+ PIPE_LOCK(rpipe);
+ }
+ }
+
+ while (uio->uio_resid) {
+ /*
+ * normal pipe buffer receive
+ */
+ if (rpipe->pipe_buffer.cnt > 0) {
+ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
+ if (size > rpipe->pipe_buffer.cnt)
+ size = rpipe->pipe_buffer.cnt;
+ if (size > uio->uio_resid)
+ size = uio->uio_resid;
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(
+ &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
+ size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+
+ rpipe->pipe_buffer.out += size;
+ if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
+ rpipe->pipe_buffer.out = 0;
+
+ rpipe->pipe_buffer.cnt -= size;
+
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+ if (rpipe->pipe_buffer.cnt == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ }
+ nread += size;
+#ifndef PIPE_NODIRECT
+ /*
+ * Direct copy, bypassing a kernel buffer.
+ */
+ } else if ((size = rpipe->pipe_map.cnt) &&
+ (rpipe->pipe_state & PIPE_DIRECTW)) {
+ if (size > uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove_fromphys(rpipe->pipe_map.ms,
+ rpipe->pipe_map.pos, size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+ nread += size;
+ rpipe->pipe_map.pos += size;
+ rpipe->pipe_map.cnt -= size;
+ if (rpipe->pipe_map.cnt == 0) {
+ rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW);
+ wakeup(rpipe);
+ }
+#endif
+ } else {
+ /*
+ * detect EOF condition
+ * read returns 0 on EOF, no need to set error
+ */
+ if (rpipe->pipe_state & PIPE_EOF)
+ break;
+
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+
+ /*
+ * Break if some data was read.
+ */
+ if (nread > 0)
+ break;
+
+ /*
+ * Unlock the pipe buffer for our remaining processing.
+ * We will either break out with an error or we will
+ * sleep and relock to loop.
+ */
+ pipeunlock(rpipe);
+
+ /*
+ * Handle non-blocking mode operation or
+ * wait for more data.
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ } else {
+ rpipe->pipe_state |= PIPE_WANTR;
+ if ((error = msleep(rpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH,
+ "piperd", 0)) == 0)
+ error = pipelock(rpipe, 1);
+ }
+ if (error)
+ goto unlocked_error;
+ }
+ }
+#ifdef MAC
+locked_error:
+#endif
+ pipeunlock(rpipe);
+
+ /* XXX: should probably do this before getting any locks. */
+ if (error == 0)
+ vfs_timestamp(&rpipe->pipe_atime);
+unlocked_error:
+ --rpipe->pipe_busy;
+
+ /*
+ * PIPE_WANT processing only makes sense if pipe_busy is 0.
+ */
+ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
+ rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
+ wakeup(rpipe);
+ } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
+ /*
+ * Handle write blocking hysteresis.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ }
+
+ if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
+ pipeselwakeup(rpipe);
+
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+#ifndef PIPE_NODIRECT
+/*
+ * Map the sending process's buffer into kernel space and wire it.
+ * This is similar to a physical write operation.
+ */
+static int
+pipe_build_write_buffer(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ u_int size;
+ int i;
+
+ PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
+ KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
+ ("Clone attempt on non-direct write pipe!"));
+
+ if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
+ size = wpipe->pipe_buffer.size;
+ else
+ size = uio->uio_iov->iov_len;
+
+ if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
+ wpipe->pipe_map.ms, PIPENPAGES)) < 0)
+ return (EFAULT);
+
+	/* Set up the control block. */
+ wpipe->pipe_map.npages = i;
+ wpipe->pipe_map.pos =
+ ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
+ wpipe->pipe_map.cnt = size;
+
+	/*
+	 * And update the uio data.
+	 */
+ uio->uio_iov->iov_len -= size;
+ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
+ if (uio->uio_iov->iov_len == 0)
+ uio->uio_iov++;
+ uio->uio_resid -= size;
+ uio->uio_offset += size;
+ return (0);
+}
+
+/*
+ * unmap and unwire the process buffer
+ */
+static void
+pipe_destroy_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
+ wpipe->pipe_map.npages = 0;
+}
+
+/*
+ * In the case of a signal, the writing process might go away. This
+ * code copies the data into the circular buffer so that the source
+ * pages can be freed without loss of data.
+ */
+static void
+pipe_clone_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+ struct uio uio;
+ struct iovec iov;
+ int size;
+ int pos;
+
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ size = wpipe->pipe_map.cnt;
+ pos = wpipe->pipe_map.pos;
+
+ wpipe->pipe_buffer.in = size;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = size;
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+
+ PIPE_UNLOCK(wpipe);
+ iov.iov_base = wpipe->pipe_buffer.buffer;
+ iov.iov_len = size;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = size;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = curthread;
+ uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
+ PIPE_LOCK(wpipe);
+ pipe_destroy_write_buffer(wpipe);
+}
+
+/*
+ * This implements the pipe buffer write mechanism. Note that only
+ * a direct write OR a normal pipe write can be pending at any given time.
+ * If there are any characters in the pipe buffer, the direct write will
+ * be deferred until the receiving process grabs all of the bytes from
+ * the pipe buffer. Then the direct mapping write is set-up.
+ */
+static int
+pipe_direct_write(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ int error;
+
+retry:
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ error = pipelock(wpipe, 1);
+ if (wpipe->pipe_state & PIPE_EOF)
+ error = EPIPE;
+ if (error) {
+ pipeunlock(wpipe);
+ goto error1;
+ }
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdww", 0);
+ if (error)
+ goto error1;
+ else
+ goto retry;
+ }
+ wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
+ if (wpipe->pipe_buffer.cnt > 0) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdwc", 0);
+ if (error)
+ goto error1;
+ else
+ goto retry;
+ }
+
+ wpipe->pipe_state |= PIPE_DIRECTW;
+
+ PIPE_UNLOCK(wpipe);
+ error = pipe_build_write_buffer(wpipe, uio);
+ PIPE_LOCK(wpipe);
+ if (error) {
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+ pipeunlock(wpipe);
+ goto error1;
+ }
+
+ error = 0;
+ while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipe_destroy_write_buffer(wpipe);
+ pipeselwakeup(wpipe);
+ pipeunlock(wpipe);
+ error = EPIPE;
+ goto error1;
+ }
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
+ "pipdwt", 0);
+ pipelock(wpipe, 0);
+ }
+
+ if (wpipe->pipe_state & PIPE_EOF)
+ error = EPIPE;
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ /*
+		 * This bit of trickery substitutes a kernel buffer for
+		 * the user pages of a writer that might be going away.
+ */
+ pipe_clone_write_buffer(wpipe);
+ } else {
+ pipe_destroy_write_buffer(wpipe);
+ }
+ pipeunlock(wpipe);
+ return (error);
+
+error1:
+ wakeup(wpipe);
+ return (error);
+}
+#endif
+
+static int
+pipe_write(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ struct thread *td;
+ int flags;
+{
+ int error = 0;
+ int desiredsize;
+ ssize_t orig_resid;
+ struct pipe *wpipe, *rpipe;
+
+ rpipe = fp->f_data;
+ wpipe = PIPE_PEER(rpipe);
+ PIPE_LOCK(rpipe);
+ error = pipelock(wpipe, 1);
+ if (error) {
+ PIPE_UNLOCK(rpipe);
+ return (error);
+ }
+ /*
+ * detect loss of pipe read side, issue SIGPIPE if lost.
+ */
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (EPIPE);
+ }
+#ifdef MAC
+ error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
+ if (error) {
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (error);
+ }
+#endif
+ ++wpipe->pipe_busy;
+
+ /* Choose a larger size if it's advantageous */
+ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
+ while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
+ if (piperesizeallowed != 1)
+ break;
+ if (amountpipekva > maxpipekva / 2)
+ break;
+ if (desiredsize == BIG_PIPE_SIZE)
+ break;
+ desiredsize = desiredsize * 2;
+ }
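+
+	/*
+	 * Illustrative note (not part of the original code): assuming the
+	 * usual 16K default pipe buffer and a 64K BIG_PIPE_SIZE, an 80K
+	 * pending write makes desiredsize double 16K -> 32K -> 64K and
+	 * then stop at the BIG_PIPE_SIZE check above.
+	 */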
+
+	/* Choose a smaller size if we're in an OOM situation */
+ if ((amountpipekva > (3 * maxpipekva) / 4) &&
+ (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
+ (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
+ (piperesizeallowed == 1))
+ desiredsize = SMALL_PIPE_SIZE;
+
+ /* Resize if the above determined that a new size was necessary */
+ if ((desiredsize != wpipe->pipe_buffer.size) &&
+ ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
+ PIPE_UNLOCK(wpipe);
+ pipespace(wpipe, desiredsize);
+ PIPE_LOCK(wpipe);
+ }
+ if (wpipe->pipe_buffer.size == 0) {
+ /*
+ * This can only happen for reverse direction use of pipes
+ * in a complete OOM situation.
+ */
+ error = ENOMEM;
+ --wpipe->pipe_busy;
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(wpipe);
+ return (error);
+ }
+
+ pipeunlock(wpipe);
+
+ orig_resid = uio->uio_resid;
+
+ while (uio->uio_resid) {
+ int space;
+
+ pipelock(wpipe, 0);
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipeunlock(wpipe);
+ error = EPIPE;
+ break;
+ }
+#ifndef PIPE_NODIRECT
+ /*
+ * If the transfer is large, we can gain performance if
+ * we do process-to-process copies directly.
+ * If the write is non-blocking, we don't use the
+ * direct write mechanism.
+ *
+ * The direct write mechanism will detect the reader going
+ * away on us.
+ */
+ if (uio->uio_segflg == UIO_USERSPACE &&
+ uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
+ wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
+ (fp->f_flag & FNONBLOCK) == 0) {
+ pipeunlock(wpipe);
+ error = pipe_direct_write(wpipe, uio);
+ if (error)
+ break;
+ continue;
+ }
+#endif
+
+ /*
+		 * Pipe buffered writes cannot coexist with
+		 * direct writes.  We wait until the currently executing
+		 * direct write has completed before we start filling the
+		 * pipe buffer.  We break out if a signal occurs or the
+ * reader goes away.
+ */
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
+ "pipbww", 0);
+ if (error)
+ break;
+ else
+ continue;
+ }
+
+ space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+
+ /* Writes of size <= PIPE_BUF must be atomic. */
+ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
+ space = 0;
+
+ if (space > 0) {
+ int size; /* Transfer size */
+ int segsize; /* first segment to transfer */
+
+ /*
+ * Transfer size is minimum of uio transfer
+ * and free space in pipe buffer.
+ */
+ if (space > uio->uio_resid)
+ size = uio->uio_resid;
+ else
+ size = space;
+ /*
+ * First segment to transfer is minimum of
+ * transfer size and contiguous space in
+ * pipe buffer. If first segment to transfer
+ * is less than the transfer size, we've got
+ * a wraparound in the buffer.
+ */
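+			/*
+			 * Illustrative example (not from the original code):
+			 * with a 16K buffer, in = 12K and a 6K transfer,
+			 * segsize is clamped to 4K here; the remaining 2K is
+			 * copied to the start of the buffer by the wraparound
+			 * path below.
+			 */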
+ segsize = wpipe->pipe_buffer.size -
+ wpipe->pipe_buffer.in;
+ if (segsize > size)
+ segsize = size;
+
+ /* Transfer first segment */
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
+ segsize, uio);
+ PIPE_LOCK(rpipe);
+
+ if (error == 0 && segsize < size) {
+ KASSERT(wpipe->pipe_buffer.in + segsize ==
+ wpipe->pipe_buffer.size,
+ ("Pipe buffer wraparound disappeared"));
+ /*
+ * Transfer remaining part now, to
+ * support atomic writes. Wraparound
+ * happened.
+ */
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(
+ &wpipe->pipe_buffer.buffer[0],
+ size - segsize, uio);
+ PIPE_LOCK(rpipe);
+ }
+ if (error == 0) {
+ wpipe->pipe_buffer.in += size;
+ if (wpipe->pipe_buffer.in >=
+ wpipe->pipe_buffer.size) {
+ KASSERT(wpipe->pipe_buffer.in ==
+ size - segsize +
+ wpipe->pipe_buffer.size,
+ ("Expected wraparound bad"));
+ wpipe->pipe_buffer.in = size - segsize;
+ }
+
+ wpipe->pipe_buffer.cnt += size;
+ KASSERT(wpipe->pipe_buffer.cnt <=
+ wpipe->pipe_buffer.size,
+ ("Pipe buffer overflow"));
+ }
+ pipeunlock(wpipe);
+ if (error != 0)
+ break;
+ } else {
+ /*
+ * If the "read-side" has been blocked, wake it up now.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ /*
+ * don't block on non-blocking I/O
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ pipeunlock(wpipe);
+ break;
+ }
+
+ /*
+ * We have no more space and have something to offer,
+ * wake up select/poll.
+ */
+ pipeselwakeup(wpipe);
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH, "pipewr", 0);
+ if (error != 0)
+ break;
+ }
+ }
+
+ pipelock(wpipe, 0);
+ --wpipe->pipe_busy;
+
+ if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
+ wakeup(wpipe);
+ } else if (wpipe->pipe_buffer.cnt > 0) {
+ /*
+ * If we have put any characters in the buffer, we wake up
+ * the reader.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ }
+
+ /*
+ * Don't return EPIPE if I/O was successful
+ */
+ if ((wpipe->pipe_buffer.cnt == 0) &&
+ (uio->uio_resid == 0) &&
+ (error == EPIPE)) {
+ error = 0;
+ }
+
+ if (error == 0)
+ vfs_timestamp(&wpipe->pipe_mtime);
+
+ /*
+ * We have something to offer,
+ * wake up select/poll.
+ */
+ if (wpipe->pipe_buffer.cnt)
+ pipeselwakeup(wpipe);
+
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+pipe_truncate(fp, length, active_cred, td)
+ struct file *fp;
+ off_t length;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+
+ /* For named pipes call the vnode operation. */
+ if (fp->f_vnode != NULL)
+ return (vnops.fo_truncate(fp, length, active_cred, td));
+ return (EINVAL);
+}
+
+/*
+ * we implement a very minimal set of ioctls for compatibility with sockets.
+ */
+static int
+pipe_ioctl(fp, cmd, data, active_cred, td)
+ struct file *fp;
+ u_long cmd;
+ void *data;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *mpipe = fp->f_data;
+ int error;
+
+ PIPE_LOCK(mpipe);
+
+#ifdef MAC
+ error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
+ if (error) {
+ PIPE_UNLOCK(mpipe);
+ return (error);
+ }
+#endif
+
+ error = 0;
+ switch (cmd) {
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ mpipe->pipe_state |= PIPE_ASYNC;
+ } else {
+ mpipe->pipe_state &= ~PIPE_ASYNC;
+ }
+ break;
+
+ case FIONREAD:
+ if (!(fp->f_flag & FREAD)) {
+ *(int *)data = 0;
+ PIPE_UNLOCK(mpipe);
+ return (0);
+ }
+ if (mpipe->pipe_state & PIPE_DIRECTW)
+ *(int *)data = mpipe->pipe_map.cnt;
+ else
+ *(int *)data = mpipe->pipe_buffer.cnt;
+ break;
+
+ case FIOSETOWN:
+ PIPE_UNLOCK(mpipe);
+ error = fsetown(*(int *)data, &mpipe->pipe_sigio);
+ goto out_unlocked;
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&mpipe->pipe_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ PIPE_UNLOCK(mpipe);
+ error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
+ goto out_unlocked;
+
+ /* This is deprecated, FIOGETOWN should be used instead. */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(&mpipe->pipe_sigio);
+ break;
+
+ default:
+ error = ENOTTY;
+ break;
+ }
+ PIPE_UNLOCK(mpipe);
+out_unlocked:
+ return (error);
+}
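+
+/*
+ * Illustrative userland sketch (not part of this change): the FIONREAD case
+ * above can be exercised as follows, where pfd[] came from pipe(2):
+ *
+ *	int nbytes;
+ *
+ *	if (ioctl(pfd[0], FIONREAD, &nbytes) == 0)
+ *		printf("%d bytes buffered\n", nbytes);
+ */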
+
+static int
+pipe_poll(fp, events, active_cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *rpipe;
+ struct pipe *wpipe;
+ int levents, revents;
+#ifdef MAC
+ int error;
+#endif
+
+ revents = 0;
+ rpipe = fp->f_data;
+ wpipe = PIPE_PEER(rpipe);
+ PIPE_LOCK(rpipe);
+#ifdef MAC
+ error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
+ if (error)
+ goto locked_error;
+#endif
+ if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
+ if ((rpipe->pipe_state & PIPE_DIRECTW) ||
+ (rpipe->pipe_buffer.cnt > 0))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF) ||
+ (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
+ ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
+ wpipe->pipe_buffer.size == 0)))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ levents = events &
+ (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
+ if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
+ fp->f_seqcount == rpipe->pipe_wgen)
+ events |= POLLINIGNEOF;
+
+ if ((events & POLLINIGNEOF) == 0) {
+ if (rpipe->pipe_state & PIPE_EOF) {
+ revents |= (events & (POLLIN | POLLRDNORM));
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF))
+ revents |= POLLHUP;
+ }
+ }
+
+ if (revents == 0) {
+ if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
+ selrecord(td, &rpipe->pipe_sel);
+ if (SEL_WAITING(&rpipe->pipe_sel))
+ rpipe->pipe_state |= PIPE_SEL;
+ }
+
+ if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &wpipe->pipe_sel);
+ if (SEL_WAITING(&wpipe->pipe_sel))
+ wpipe->pipe_state |= PIPE_SEL;
+ }
+ }
+#ifdef MAC
+locked_error:
+#endif
+ PIPE_UNLOCK(rpipe);
+
+ return (revents);
+}
+
+/*
+ * We shouldn't need locks here as we're doing a read and this should
+ * be a natural race.
+ */
+static int
+pipe_stat(fp, ub, active_cred, td)
+ struct file *fp;
+ struct stat *ub;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *pipe;
+ int new_unr;
+#ifdef MAC
+ int error;
+#endif
+
+ pipe = fp->f_data;
+ PIPE_LOCK(pipe);
+#ifdef MAC
+ error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
+ if (error) {
+ PIPE_UNLOCK(pipe);
+ return (error);
+ }
+#endif
+
+ /* For named pipes ask the underlying filesystem. */
+ if (pipe->pipe_state & PIPE_NAMED) {
+ PIPE_UNLOCK(pipe);
+ return (vnops.fo_stat(fp, ub, active_cred, td));
+ }
+
+ /*
+ * Lazily allocate an inode number for the pipe. Most pipe
+	 * users do not call fstat(2) on the pipe, so postponing the
+	 * inode allocation until it must be returned to userland is
+	 * useful.  If alloc_unr failed,
+ * assign st_ino zero instead of returning an error.
+ * Special pipe_ino values:
+ * -1 - not yet initialized;
+ * 0 - alloc_unr failed, return 0 as st_ino forever.
+ */
+ if (pipe->pipe_ino == (ino_t)-1) {
+ new_unr = alloc_unr(pipeino_unr);
+ if (new_unr != -1)
+ pipe->pipe_ino = new_unr;
+ else
+ pipe->pipe_ino = 0;
+ }
+ PIPE_UNLOCK(pipe);
+
+ bzero(ub, sizeof(*ub));
+ ub->st_mode = S_IFIFO;
+ ub->st_blksize = PAGE_SIZE;
+ if (pipe->pipe_state & PIPE_DIRECTW)
+ ub->st_size = pipe->pipe_map.cnt;
+ else
+ ub->st_size = pipe->pipe_buffer.cnt;
+ ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
+ ub->st_atim = pipe->pipe_atime;
+ ub->st_mtim = pipe->pipe_mtime;
+ ub->st_ctim = pipe->pipe_ctime;
+ ub->st_uid = fp->f_cred->cr_uid;
+ ub->st_gid = fp->f_cred->cr_gid;
+ ub->st_dev = pipedev_ino;
+ ub->st_ino = pipe->pipe_ino;
+ /*
+ * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pipe_close(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+
+ if (fp->f_vnode != NULL)
+ return vnops.fo_close(fp, td);
+ fp->f_ops = &badfileops;
+ pipe_dtor(fp->f_data);
+ fp->f_data = NULL;
+ return (0);
+}
+
+static int
+pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
+{
+ struct pipe *cpipe;
+ int error;
+
+ cpipe = fp->f_data;
+ if (cpipe->pipe_state & PIPE_NAMED)
+ error = vn_chmod(fp, mode, active_cred, td);
+ else
+ error = invfo_chmod(fp, mode, active_cred, td);
+ return (error);
+}
+
+static int
+pipe_chown(fp, uid, gid, active_cred, td)
+ struct file *fp;
+ uid_t uid;
+ gid_t gid;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *cpipe;
+ int error;
+
+ cpipe = fp->f_data;
+ if (cpipe->pipe_state & PIPE_NAMED)
+ error = vn_chown(fp, uid, gid, active_cred, td);
+ else
+ error = invfo_chown(fp, uid, gid, active_cred, td);
+ return (error);
+}
+
+static void
+pipe_free_kmem(cpipe)
+ struct pipe *cpipe;
+{
+
+ KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
+ ("pipe_free_kmem: pipe mutex locked"));
+
+ if (cpipe->pipe_buffer.buffer != NULL) {
+ atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
+ vm_map_remove(pipe_map,
+ (vm_offset_t)cpipe->pipe_buffer.buffer,
+ (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
+ cpipe->pipe_buffer.buffer = NULL;
+ }
+#ifndef PIPE_NODIRECT
+ {
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ }
+#endif
+}
+
+/*
+ * shutdown the pipe
+ */
+static void
+pipeclose(cpipe)
+ struct pipe *cpipe;
+{
+ struct pipepair *pp;
+ struct pipe *ppipe;
+
+ KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
+
+ PIPE_LOCK(cpipe);
+ pipelock(cpipe, 0);
+ pp = cpipe->pipe_pair;
+
+ pipeselwakeup(cpipe);
+
+ /*
+ * If the other side is blocked, wake it up saying that
+ * we want to close it down.
+ */
+ cpipe->pipe_state |= PIPE_EOF;
+ while (cpipe->pipe_busy) {
+ wakeup(cpipe);
+ cpipe->pipe_state |= PIPE_WANT;
+ pipeunlock(cpipe);
+ msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
+ pipelock(cpipe, 0);
+ }
+
+ /*
+ * Disconnect from peer, if any.
+ */
+ ppipe = cpipe->pipe_peer;
+ if (ppipe->pipe_present == PIPE_ACTIVE) {
+ pipeselwakeup(ppipe);
+
+ ppipe->pipe_state |= PIPE_EOF;
+ wakeup(ppipe);
+ KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
+ }
+
+ /*
+	 * Release this endpoint's kmem resources.  We don't mark the
+	 * endpoint as closed until we've finished doing that, or the
+	 * pipe might disappear out from under us.
+ */
+ PIPE_UNLOCK(cpipe);
+ pipe_free_kmem(cpipe);
+ PIPE_LOCK(cpipe);
+ cpipe->pipe_present = PIPE_CLOSING;
+ pipeunlock(cpipe);
+
+ /*
+	 * knlist_clear() may sleep, dropping the PIPE_MTX.  Set
+	 * PIPE_FINALIZED, which allows the other end to free the
+	 * pipe_pair, only after the knotes are completely dismantled.
+ */
+ knlist_clear(&cpipe->pipe_sel.si_note, 1);
+ cpipe->pipe_present = PIPE_FINALIZED;
+ seldrain(&cpipe->pipe_sel);
+ knlist_destroy(&cpipe->pipe_sel.si_note);
+
+ /*
+ * If both endpoints are now closed, release the memory for the
+ * pipe pair. If not, unlock.
+ */
+ if (ppipe->pipe_present == PIPE_FINALIZED) {
+ PIPE_UNLOCK(cpipe);
+#ifdef MAC
+ mac_pipe_destroy(pp);
+#endif
+ uma_zfree(pipe_zone, cpipe->pipe_pair);
+ } else
+ PIPE_UNLOCK(cpipe);
+}
+
+/*ARGSUSED*/
+static int
+pipe_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct pipe *cpipe;
+
+ /*
+ * If a filter is requested that is not supported by this file
+ * descriptor, don't return an error, but also don't ever generate an
+ * event.
+ */
+ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
+ kn->kn_fop = &pipe_nfiltops;
+ return (0);
+ }
+ if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
+ kn->kn_fop = &pipe_nfiltops;
+ return (0);
+ }
+ cpipe = fp->f_data;
+ PIPE_LOCK(cpipe);
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &pipe_rfiltops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &pipe_wfiltops;
+ if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
+ /* other end of pipe has been closed */
+ PIPE_UNLOCK(cpipe);
+ return (EPIPE);
+ }
+ cpipe = PIPE_PEER(cpipe);
+ break;
+ default:
+ PIPE_UNLOCK(cpipe);
+ return (EINVAL);
+ }
+
+ kn->kn_hook = cpipe;
+ knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
+ PIPE_UNLOCK(cpipe);
+ return (0);
+}
+
+static void
+filt_pipedetach(struct knote *kn)
+{
+ struct pipe *cpipe = kn->kn_hook;
+
+ PIPE_LOCK(cpipe);
+ knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
+ PIPE_UNLOCK(cpipe);
+}
+
+/*ARGSUSED*/
+static int
+filt_piperead(struct knote *kn, long hint)
+{
+ struct pipe *rpipe = kn->kn_hook;
+ struct pipe *wpipe = rpipe->pipe_peer;
+ int ret;
+
+ PIPE_LOCK(rpipe);
+ kn->kn_data = rpipe->pipe_buffer.cnt;
+ if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
+ kn->kn_data = rpipe->pipe_map.cnt;
+
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(rpipe);
+ return (1);
+ }
+ ret = kn->kn_data > 0;
+ PIPE_UNLOCK(rpipe);
+ return ret;
+}
+
+/*ARGSUSED*/
+static int
+filt_pipewrite(struct knote *kn, long hint)
+{
+ struct pipe *wpipe;
+
+ wpipe = kn->kn_hook;
+ PIPE_LOCK(wpipe);
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_data = 0;
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(wpipe);
+ return (1);
+ }
+ kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
+ (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF;
+ if (wpipe->pipe_state & PIPE_DIRECTW)
+ kn->kn_data = 0;
+
+ PIPE_UNLOCK(wpipe);
+ return (kn->kn_data >= PIPE_BUF);
+}
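+
+/*
+ * Illustrative userland sketch (not part of this change): the filters above
+ * are reached through kevent(2), e.g. to wait until the read side of a
+ * pipe(2) descriptor pair pfd[] becomes readable:
+ *
+ *	struct kevent ev;
+ *	int kq = kqueue();
+ *
+ *	EV_SET(&ev, pfd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &ev, 1, NULL, 0, NULL);
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);	-- blocks; ev.data = bytes readable
+ */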
+
+static void
+filt_pipedetach_notsup(struct knote *kn)
+{
+
+}
+
+static int
+filt_pipenotsup(struct knote *kn, long hint)
+{
+
+ return (0);
+}
diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c
new file mode 100644
index 0000000..4bafeab
--- /dev/null
+++ b/sys/kern/sys_procdesc.c
@@ -0,0 +1,535 @@
+/*-
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*-
+ * FreeBSD process descriptor facility.
+ *
+ * Some processes are represented by a file descriptor, which will be used in
+ * preference to signaling and pids for the purposes of process management,
+ * and is, in effect, a form of capability. When a process descriptor is
+ * used with a process, it ceases to be visible to certain traditional UNIX
+ * process facilities, such as waitpid(2).
+ *
+ * Some semantics:
+ *
+ * - At most one process descriptor will exist for any process, although
+ * references to that descriptor may be held from many processes (or even
+ * be in flight between processes over a local domain socket).
+ * - Last close on the process descriptor will terminate the process using
+ * SIGKILL and reparent it to init so that there's a process to reap it
+ * when it's done exiting.
+ * - If the process exits before the descriptor is closed, it will not
+ * generate SIGCHLD on termination, or be picked up by waitpid().
+ * - The pdkill(2) system call may be used to deliver a signal to the process
+ * using its process descriptor.
+ * - The pdwait4(2) system call may be used to block (or not) on a process
+ * descriptor to collect termination information.
+ *
+ * Open questions:
+ *
+ * - How to handle ptrace(2)?
+ * - Will we want to add a pidtoprocdesc(2) system call to allow process
+ * descriptors to be created for processes without pdfork(2)?
+ */
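+
+/*
+ * Illustrative userland sketch (not part of this change): the facility is
+ * typically consumed through pdfork(2), pdgetpid(2) and pdkill(2), e.g.:
+ *
+ *	int pd;
+ *	pid_t pid;
+ *
+ *	pid = pdfork(&pd, 0);
+ *	if (pid == 0)
+ *		_exit(do_child_work());	-- do_child_work() is hypothetical
+ *	pdkill(pd, SIGTERM);
+ *	close(pd);	-- last close reparents to init(8); SIGKILL if still running
+ */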
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/resourcevar.h>
+#include <sys/stat.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+
+#ifdef PROCDESC
+
+FEATURE(process_descriptors, "Process Descriptors");
+
+static uma_zone_t procdesc_zone;
+
+static fo_rdwr_t procdesc_read;
+static fo_rdwr_t procdesc_write;
+static fo_truncate_t procdesc_truncate;
+static fo_ioctl_t procdesc_ioctl;
+static fo_poll_t procdesc_poll;
+static fo_kqfilter_t procdesc_kqfilter;
+static fo_stat_t procdesc_stat;
+static fo_close_t procdesc_close;
+static fo_chmod_t procdesc_chmod;
+static fo_chown_t procdesc_chown;
+
+static struct fileops procdesc_ops = {
+ .fo_read = procdesc_read,
+ .fo_write = procdesc_write,
+ .fo_truncate = procdesc_truncate,
+ .fo_ioctl = procdesc_ioctl,
+ .fo_poll = procdesc_poll,
+ .fo_kqfilter = procdesc_kqfilter,
+ .fo_stat = procdesc_stat,
+ .fo_close = procdesc_close,
+ .fo_chmod = procdesc_chmod,
+ .fo_chown = procdesc_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Initialize with VFS so that process descriptors are available along with
+ * other file descriptor types. As long as it runs before init(8) starts,
+ * there shouldn't be a problem.
+ */
+static void
+procdesc_init(void *dummy __unused)
+{
+
+ procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (procdesc_zone == NULL)
+ panic("procdesc_init: procdesc_zone not initialized");
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL);
+
+/*
+ * Return a locked process given a process descriptor, or ESRCH if it has
+ * died.
+ */
+int
+procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp,
+ struct proc **p)
+{
+ struct procdesc *pd;
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rightsp, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ pd = fp->f_data;
+ sx_slock(&proctree_lock);
+ if (pd->pd_proc != NULL) {
+ *p = pd->pd_proc;
+ PROC_LOCK(*p);
+ } else
+ error = ESRCH;
+ sx_sunlock(&proctree_lock);
+out:
+ fdrop(fp, td);
+ return (error);
+}
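+
+/*
+ * Illustrative in-kernel usage sketch (not part of this change): callers are
+ * expected to drop the returned process lock themselves, e.g.:
+ *
+ *	cap_rights_t rights;
+ *	struct proc *p;
+ *
+ *	error = procdesc_find(td, fd, cap_rights_init(&rights, CAP_PDKILL), &p);
+ *	if (error == 0) {
+ *		... operate on the locked process ...
+ *		PROC_UNLOCK(p);
+ *	}
+ */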
+
+/*
+ * Function to be used by procstat(1) sysctls when returning procdesc
+ * information.
+ */
+pid_t
+procdesc_pid(struct file *fp_procdesc)
+{
+ struct procdesc *pd;
+
+ KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
+ ("procdesc_pid: !procdesc"));
+
+ pd = fp_procdesc->f_data;
+ return (pd->pd_pid);
+}
+
+/*
+ * Retrieve the PID associated with a process descriptor.
+ */
+int
+kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rightsp, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ *pidp = procdesc_pid(fp);
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * System call to return the pid of a process given its process descriptor.
+ */
+int
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+ cap_rights_t rights;
+ pid_t pid;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = kern_pdgetpid(td, uap->fd,
+ cap_rights_init(&rights, CAP_PDGETPID), &pid);
+ if (error == 0)
+ error = copyout(&pid, uap->pidp, sizeof(pid));
+ return (error);
+}
+
+/*
+ * When a new process is forked by pdfork(), a file descriptor is allocated
+ * by the fork code first, then the process is forked, and then we get a
+ * chance to set up the process descriptor. Failure is not permitted at this
+ * point, so procdesc_new() must succeed.
+ */
+void
+procdesc_new(struct proc *p, int flags)
+{
+ struct procdesc *pd;
+
+ pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO);
+ pd->pd_proc = p;
+ pd->pd_pid = p->p_pid;
+ p->p_procdesc = pd;
+ pd->pd_flags = 0;
+ if (flags & PD_DAEMON)
+ pd->pd_flags |= PDF_DAEMON;
+ PROCDESC_LOCK_INIT(pd);
+
+ /*
+ * Process descriptors start out with two references: one from their
+ * struct file, and the other from their struct proc.
+ */
+ refcount_init(&pd->pd_refcount, 2);
+}
+
+/*
+ * Initialize a file with a process descriptor.
+ */
+void
+procdesc_finit(struct procdesc *pdp, struct file *fp)
+{
+
+ finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
+}
+
+static void
+procdesc_free(struct procdesc *pd)
+{
+
+ /*
+	 * When the last reference is released, we assert that the descriptor
+	 * has been closed, but not that the process has exited, as we will
+	 * detach the descriptor before the process dies if the descriptor is
+	 * closed, since we can't wait synchronously.
+ */
+ if (refcount_release(&pd->pd_refcount)) {
+ KASSERT(pd->pd_proc == NULL,
+ ("procdesc_free: pd_proc != NULL"));
+ KASSERT((pd->pd_flags & PDF_CLOSED),
+ ("procdesc_free: !PDF_CLOSED"));
+
+ PROCDESC_LOCK_DESTROY(pd);
+ uma_zfree(procdesc_zone, pd);
+ }
+}
+
+/*
+ * procdesc_exit() - notify a process descriptor that its process is exiting.
+ * We use the proctree_lock to ensure that process exit either happens
+ * strictly before or strictly after a concurrent call to procdesc_close().
+ */
+int
+procdesc_exit(struct proc *p)
+{
+ struct procdesc *pd;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
+
+ pd = p->p_procdesc;
+
+ PROCDESC_LOCK(pd);
+ KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
+ ("procdesc_exit: closed && parent not init"));
+
+ pd->pd_flags |= PDF_EXITED;
+
+ /*
+ * If the process descriptor has been closed, then we have nothing
+ * to do; return 1 so that init will get SIGCHLD and do the reaping.
+ * Clean up the procdesc now rather than letting it happen during
+ * that reap.
+ */
+ if (pd->pd_flags & PDF_CLOSED) {
+ PROCDESC_UNLOCK(pd);
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+ return (1);
+ }
+ if (pd->pd_flags & PDF_SELECTED) {
+ pd->pd_flags &= ~PDF_SELECTED;
+ selwakeup(&pd->pd_selinfo);
+ }
+ PROCDESC_UNLOCK(pd);
+ return (0);
+}
+
+/*
+ * When a process descriptor is reaped, perhaps as a result of close() or
+ * pdwait4(), release the process's reference on the process descriptor.
+ */
+void
+procdesc_reap(struct proc *p)
+{
+ struct procdesc *pd;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
+
+ pd = p->p_procdesc;
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+}
+
+/*
+ * procdesc_close() - last close on a process descriptor. If the process is
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
+ * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
+ */
+static int
+procdesc_close(struct file *fp, struct thread *td)
+{
+ struct procdesc *pd;
+ struct proc *p;
+
+ KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
+
+ pd = fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+
+ sx_xlock(&proctree_lock);
+ PROCDESC_LOCK(pd);
+ pd->pd_flags |= PDF_CLOSED;
+ PROCDESC_UNLOCK(pd);
+ p = pd->pd_proc;
+ if (p == NULL) {
+ /*
+		 * This is the case where the process's exit status was already
+ * collected and procdesc_reap() was already called.
+ */
+ sx_xunlock(&proctree_lock);
+ } else if (p->p_state == PRS_ZOMBIE) {
+ /*
+ * If the process is already dead and just awaiting reaping,
+ * do that now. This will release the process's reference to
+ * the process descriptor when it calls back into
+ * procdesc_reap().
+ */
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ proc_reap(curthread, p, NULL, 0);
+ } else {
+ /*
+ * If the process is not yet dead, we need to kill it, but we
+ * can't wait around synchronously for it to go away, as that
+ * path leads to madness (and deadlocks). First, detach the
+ * process from its descriptor so that its exit status will
+ * be reported normally.
+ */
+ PROC_LOCK(p);
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+
+ /*
+ * Next, reparent it to init(8) so that there's someone to
+ * pick up the pieces; finally, terminate with prejudice.
+ */
+ p->p_sigparent = SIGCHLD;
+ proc_reparent(p, initproc);
+ if ((pd->pd_flags & PDF_DAEMON) == 0)
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ }
+
+ /*
+ * Release the file descriptor's reference on the process descriptor.
+ */
+ procdesc_free(pd);
+ return (0);
+}
+
+static int
+procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct procdesc *pd;
+ int revents;
+
+ revents = 0;
+ pd = fp->f_data;
+ PROCDESC_LOCK(pd);
+ if (pd->pd_flags & PDF_EXITED)
+ revents |= POLLHUP;
+ if (revents == 0) {
+ selrecord(td, &pd->pd_selinfo);
+ pd->pd_flags |= PDF_SELECTED;
+ }
+ PROCDESC_UNLOCK(pd);
+ return (revents);
+}
+
+static int
+procdesc_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct procdesc *pd;
+ struct timeval pstart;
+
+ /*
+ * XXXRW: Perhaps we should cache some more information from the
+ * process so that we can return it reliably here even after it has
+ * died. For example, caching its credential data.
+ */
+ bzero(sb, sizeof(*sb));
+ pd = fp->f_data;
+ sx_slock(&proctree_lock);
+ if (pd->pd_proc != NULL) {
+ PROC_LOCK(pd->pd_proc);
+
+ /* Set birth and [acm] times to process start time. */
+ pstart = pd->pd_proc->p_stats->p_start;
+ timevaladd(&pstart, &boottime);
+ TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim);
+ sb->st_atim = sb->st_birthtim;
+ sb->st_ctim = sb->st_birthtim;
+ sb->st_mtim = sb->st_birthtim;
+ if (pd->pd_proc->p_state != PRS_ZOMBIE)
+ sb->st_mode = S_IFREG | S_IRWXU;
+ else
+ sb->st_mode = S_IFREG;
+ sb->st_uid = pd->pd_proc->p_ucred->cr_ruid;
+ sb->st_gid = pd->pd_proc->p_ucred->cr_rgid;
+ PROC_UNLOCK(pd->pd_proc);
+ } else
+ sb->st_mode = S_IFREG;
+ sx_sunlock(&proctree_lock);
+ return (0);
+}
+
+static int
+procdesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+#else /* !PROCDESC */
+
+int
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* PROCDESC */
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
new file mode 100644
index 0000000..5508dcf
--- /dev/null
+++ b/sys/kern/sys_process.c
@@ -0,0 +1,1242 @@
+/*-
+ * Copyright (c) 1994, Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Sean Eric Fagan.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/ptrace.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/signalvar.h>
+
+#include <machine/reg.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/procfs.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+
+struct ptrace_io_desc32 {
+ int piod_op;
+ uint32_t piod_offs;
+ uint32_t piod_addr;
+ uint32_t piod_len;
+};
+
+struct ptrace_vm_entry32 {
+ int pve_entry;
+ int pve_timestamp;
+ uint32_t pve_start;
+ uint32_t pve_end;
+ uint32_t pve_offset;
+ u_int pve_prot;
+ u_int pve_pathlen;
+ int32_t pve_fileid;
+ u_int pve_fsid;
+ uint32_t pve_path;
+};
+
+struct ptrace_lwpinfo32 {
+ lwpid_t pl_lwpid; /* LWP described. */
+ int pl_event; /* Event that stopped the LWP. */
+ int pl_flags; /* LWP flags. */
+ sigset_t pl_sigmask; /* LWP signal mask */
+ sigset_t pl_siglist; /* LWP pending signal */
+ struct siginfo32 pl_siginfo; /* siginfo for signal */
+ char pl_tdname[MAXCOMLEN + 1]; /* LWP name. */
+ int pl_child_pid; /* New child pid */
+};
+
+#endif
+
+/*
+ * Functions implemented using PROC_ACTION():
+ *
+ * proc_read_regs(proc, regs)
+ * Get the current user-visible register set from the process
+ * and copy it into the regs structure (<machine/reg.h>).
+ * The process is stopped at the time read_regs is called.
+ *
+ * proc_write_regs(proc, regs)
+ * Update the current register set from the passed in regs
+ * structure. Take care to avoid clobbering special CPU
+ * registers or privileged bits in the PSL.
+ * Depending on the architecture this may have fix-up work to do,
+ * especially if the IAR or PCW are modified.
+ * The process is stopped at the time write_regs is called.
+ *
+ * proc_read_fpregs, proc_write_fpregs
+ * deal with the floating point register set, otherwise as above.
+ *
+ * proc_read_dbregs, proc_write_dbregs
+ * deal with the processor debug register set, otherwise as above.
+ *
+ * proc_sstep(proc)
+ * Arrange for the process to trap after executing a single instruction.
+ */
+
+#define PROC_ACTION(action) do { \
+ int error; \
+ \
+ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \
+ if ((td->td_proc->p_flag & P_INMEM) == 0) \
+ error = EIO; \
+ else \
+ error = (action); \
+ return (error); \
+} while(0)
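+
+/*
+ * Illustrative expansion (not part of this change): proc_read_regs() below
+ * is therefore equivalent to:
+ *
+ *	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
+ *	if ((td->td_proc->p_flag & P_INMEM) == 0)
+ *		return (EIO);
+ *	return (fill_regs(td, regs));
+ */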
+
+int
+proc_read_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(fill_regs(td, regs));
+}
+
+int
+proc_write_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(set_regs(td, regs));
+}
+
+int
+proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(fill_dbregs(td, dbregs));
+}
+
+int
+proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(set_dbregs(td, dbregs));
+}
+
+/*
+ * Ptrace doesn't support fpregs at all, and there are no security holes
+ * or translations for fpregs, so we can just copy them.
+ */
+int
+proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(fill_fpregs(td, fpregs));
+}
+
+int
+proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(set_fpregs(td, fpregs));
+}
+
+#ifdef COMPAT_FREEBSD32
+/* For 32 bit binaries, we need to expose the 32 bit regs layouts. */
+int
+proc_read_regs32(struct thread *td, struct reg32 *regs32)
+{
+
+ PROC_ACTION(fill_regs32(td, regs32));
+}
+
+int
+proc_write_regs32(struct thread *td, struct reg32 *regs32)
+{
+
+ PROC_ACTION(set_regs32(td, regs32));
+}
+
+int
+proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
+{
+
+ PROC_ACTION(fill_dbregs32(td, dbregs32));
+}
+
+int
+proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
+{
+
+ PROC_ACTION(set_dbregs32(td, dbregs32));
+}
+
+int
+proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
+{
+
+ PROC_ACTION(fill_fpregs32(td, fpregs32));
+}
+
+int
+proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
+{
+
+ PROC_ACTION(set_fpregs32(td, fpregs32));
+}
+#endif
+
+int
+proc_sstep(struct thread *td)
+{
+
+ PROC_ACTION(ptrace_single_step(td));
+}
+
+int
+proc_rwmem(struct proc *p, struct uio *uio)
+{
+ vm_map_t map;
+ vm_offset_t pageno; /* page number */
+ vm_prot_t reqprot;
+ int error, fault_flags, page_offset, writing;
+
+ /*
+ * Assert that someone has locked this vmspace. (Should be
+ * curthread but we can't assert that.) This keeps the process
+ * from exiting out from under us until this operation completes.
+ */
+ KASSERT(p->p_lock >= 1, ("%s: process %p (pid %d) not held", __func__,
+ p, p->p_pid));
+
+ /*
+ * The map we want...
+ */
+ map = &p->p_vmspace->vm_map;
+
+ /*
+ * If we are writing, then we request vm_fault() to create a private
+ * copy of each page. Since these copies will not be writeable by the
+	 * process, we must explicitly request that they be dirtied.
+ */
+ writing = uio->uio_rw == UIO_WRITE;
+ reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
+ fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
+
+ /*
+ * Only map in one page at a time. We don't have to, but it
+ * makes things easier. This way is trivial - right?
+ */
+ do {
+ vm_offset_t uva;
+ u_int len;
+ vm_page_t m;
+
+ uva = (vm_offset_t)uio->uio_offset;
+
+ /*
+ * Get the page number of this segment.
+ */
+ pageno = trunc_page(uva);
+ page_offset = uva - pageno;
+
+ /*
+ * How many bytes to copy
+ */
+ len = min(PAGE_SIZE - page_offset, uio->uio_resid);
+
+ /*
+ * Fault and hold the page on behalf of the process.
+ */
+ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
+ if (error != KERN_SUCCESS) {
+ if (error == KERN_RESOURCE_SHORTAGE)
+ error = ENOMEM;
+ else
+ error = EFAULT;
+ break;
+ }
+
+ /*
+ * Now do the i/o move.
+ */
+ error = uiomove_fromphys(&m, page_offset, len, uio);
+
+ /* Make the I-cache coherent for breakpoints. */
+ if (writing && error == 0) {
+ vm_map_lock_read(map);
+ if (vm_map_check_protection(map, pageno, pageno +
+ PAGE_SIZE, VM_PROT_EXECUTE))
+ vm_sync_icache(map, uva, len);
+ vm_map_unlock_read(map);
+ }
+
+ /*
+ * Release the page.
+ */
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
+
+ } while (error == 0 && uio->uio_resid > 0);
+
+ return (error);
+}
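+
+/*
+ * Illustrative in-kernel usage sketch (not part of this change), mirroring
+ * the PT_READ_I/PT_READ_D case in kern_ptrace() below: the caller supplies a
+ * uio whose uio_offset is the address in the target process:
+ *
+ *	iov.iov_base = (caddr_t)&tmp;
+ *	iov.iov_len = sizeof(int);
+ *	uio.uio_iov = &iov;
+ *	uio.uio_iovcnt = 1;
+ *	uio.uio_offset = (off_t)(uintptr_t)addr;
+ *	uio.uio_resid = sizeof(int);
+ *	uio.uio_segflg = UIO_SYSSPACE;
+ *	uio.uio_rw = UIO_READ;
+ *	uio.uio_td = td;
+ *	error = proc_rwmem(p, &uio);
+ */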
+
+static int
+ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve)
+{
+ struct vattr vattr;
+ vm_map_t map;
+ vm_map_entry_t entry;
+ vm_object_t obj, tobj, lobj;
+ struct vmspace *vm;
+ struct vnode *vp;
+ char *freepath, *fullpath;
+ u_int pathlen;
+ int error, index;
+
+ error = 0;
+ obj = NULL;
+
+ vm = vmspace_acquire_ref(p);
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+
+ do {
+ entry = map->header.next;
+ index = 0;
+ while (index < pve->pve_entry && entry != &map->header) {
+ entry = entry->next;
+ index++;
+ }
+ if (index != pve->pve_entry) {
+ error = EINVAL;
+ break;
+ }
+ while (entry != &map->header &&
+ (entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
+ entry = entry->next;
+ index++;
+ }
+ if (entry == &map->header) {
+ error = ENOENT;
+ break;
+ }
+
+ /* We got an entry. */
+ pve->pve_entry = index + 1;
+ pve->pve_timestamp = map->timestamp;
+ pve->pve_start = entry->start;
+ pve->pve_end = entry->end - 1;
+ pve->pve_offset = entry->offset;
+ pve->pve_prot = entry->protection;
+
+ /* Backing object's path needed? */
+ if (pve->pve_pathlen == 0)
+ break;
+
+ pathlen = pve->pve_pathlen;
+ pve->pve_pathlen = 0;
+
+ obj = entry->object.vm_object;
+ if (obj != NULL)
+ VM_OBJECT_RLOCK(obj);
+ } while (0);
+
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+
+ pve->pve_fsid = VNOVAL;
+ pve->pve_fileid = VNOVAL;
+
+ if (error == 0 && obj != NULL) {
+ lobj = obj;
+ for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ pve->pve_offset += tobj->backing_object_offset;
+ }
+ vp = (lobj->type == OBJT_VNODE) ? lobj->handle : NULL;
+ if (vp != NULL)
+ vref(vp);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ VM_OBJECT_RUNLOCK(obj);
+
+ if (vp != NULL) {
+ freepath = NULL;
+ fullpath = NULL;
+ vn_fullpath(td, vp, &fullpath, &freepath);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
+ pve->pve_fileid = vattr.va_fileid;
+ pve->pve_fsid = vattr.va_fsid;
+ }
+ vput(vp);
+
+ if (fullpath != NULL) {
+ pve->pve_pathlen = strlen(fullpath) + 1;
+ if (pve->pve_pathlen <= pathlen) {
+ error = copyout(fullpath, pve->pve_path,
+ pve->pve_pathlen);
+ } else
+ error = ENAMETOOLONG;
+ }
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ }
+ }
+
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD32
+static int
+ptrace_vm_entry32(struct thread *td, struct proc *p,
+ struct ptrace_vm_entry32 *pve32)
+{
+ struct ptrace_vm_entry pve;
+ int error;
+
+ pve.pve_entry = pve32->pve_entry;
+ pve.pve_pathlen = pve32->pve_pathlen;
+ pve.pve_path = (void *)(uintptr_t)pve32->pve_path;
+
+ error = ptrace_vm_entry(td, p, &pve);
+ if (error == 0) {
+ pve32->pve_entry = pve.pve_entry;
+ pve32->pve_timestamp = pve.pve_timestamp;
+ pve32->pve_start = pve.pve_start;
+ pve32->pve_end = pve.pve_end;
+ pve32->pve_offset = pve.pve_offset;
+ pve32->pve_prot = pve.pve_prot;
+ pve32->pve_fileid = pve.pve_fileid;
+ pve32->pve_fsid = pve.pve_fsid;
+ }
+
+ pve32->pve_pathlen = pve.pve_pathlen;
+ return (error);
+}
+
+static void
+ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
+ struct ptrace_lwpinfo32 *pl32)
+{
+
+ pl32->pl_lwpid = pl->pl_lwpid;
+ pl32->pl_event = pl->pl_event;
+ pl32->pl_flags = pl->pl_flags;
+ pl32->pl_sigmask = pl->pl_sigmask;
+ pl32->pl_siglist = pl->pl_siglist;
+ siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
+ strcpy(pl32->pl_tdname, pl->pl_tdname);
+ pl32->pl_child_pid = pl->pl_child_pid;
+}
+#endif /* COMPAT_FREEBSD32 */
+
+/*
+ * Process debugging system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ptrace_args {
+ int req;
+ pid_t pid;
+ caddr_t addr;
+ int data;
+};
+#endif
+
+#ifdef COMPAT_FREEBSD32
+/*
+ * This CPP subterfuge is to try to reduce the number of ifdefs in
+ * the body of the code.
+ * COPYIN(uap->addr, &r.reg, sizeof r.reg);
+ * becomes either:
+ * copyin(uap->addr, &r.reg, sizeof r.reg);
+ * or
+ * copyin(uap->addr, &r.reg32, sizeof r.reg32);
+ * .. except this is done at runtime.
+ */
+#define COPYIN(u, k, s) wrap32 ? \
+ copyin(u, k ## 32, s ## 32) : \
+ copyin(u, k, s)
+#define COPYOUT(k, u, s) wrap32 ? \
+ copyout(k ## 32, u, s ## 32) : \
+ copyout(k, u, s)
+#else
+#define COPYIN(u, k, s) copyin(u, k, s)
+#define COPYOUT(k, u, s) copyout(k, u, s)
+#endif
+int
+sys_ptrace(struct thread *td, struct ptrace_args *uap)
+{
+ /*
+ * XXX this obfuscation is to reduce stack usage, but the register
+ * structs may be too large to put on the stack anyway.
+ */
+ union {
+ struct ptrace_io_desc piod;
+ struct ptrace_lwpinfo pl;
+ struct ptrace_vm_entry pve;
+ struct dbreg dbreg;
+ struct fpreg fpreg;
+ struct reg reg;
+#ifdef COMPAT_FREEBSD32
+ struct dbreg32 dbreg32;
+ struct fpreg32 fpreg32;
+ struct reg32 reg32;
+ struct ptrace_io_desc32 piod32;
+ struct ptrace_lwpinfo32 pl32;
+ struct ptrace_vm_entry32 pve32;
+#endif
+ } r;
+ void *addr;
+ int error = 0;
+#ifdef COMPAT_FREEBSD32
+ int wrap32 = 0;
+
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ wrap32 = 1;
+#endif
+ AUDIT_ARG_PID(uap->pid);
+ AUDIT_ARG_CMD(uap->req);
+ AUDIT_ARG_VALUE(uap->data);
+ addr = &r;
+ switch (uap->req) {
+ case PT_GETREGS:
+ case PT_GETFPREGS:
+ case PT_GETDBREGS:
+ case PT_LWPINFO:
+ break;
+ case PT_SETREGS:
+ error = COPYIN(uap->addr, &r.reg, sizeof r.reg);
+ break;
+ case PT_SETFPREGS:
+ error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg);
+ break;
+ case PT_SETDBREGS:
+ error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
+ break;
+ case PT_IO:
+ error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
+ break;
+ case PT_VM_ENTRY:
+ error = COPYIN(uap->addr, &r.pve, sizeof r.pve);
+ break;
+ default:
+ addr = uap->addr;
+ break;
+ }
+ if (error)
+ return (error);
+
+ error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data);
+ if (error)
+ return (error);
+
+ switch (uap->req) {
+ case PT_VM_ENTRY:
+ error = COPYOUT(&r.pve, uap->addr, sizeof r.pve);
+ break;
+ case PT_IO:
+ error = COPYOUT(&r.piod, uap->addr, sizeof r.piod);
+ break;
+ case PT_GETREGS:
+ error = COPYOUT(&r.reg, uap->addr, sizeof r.reg);
+ break;
+ case PT_GETFPREGS:
+ error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg);
+ break;
+ case PT_GETDBREGS:
+ error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
+ break;
+ case PT_LWPINFO:
+ error = copyout(&r.pl, uap->addr, uap->data);
+ break;
+ }
+
+ return (error);
+}
+#undef COPYIN
+#undef COPYOUT
+
+#ifdef COMPAT_FREEBSD32
+/*
+ * PROC_READ(regs, td2, addr);
+ * becomes either:
+ * proc_read_regs(td2, addr);
+ * or
+ * proc_read_regs32(td2, addr);
+ * .. except this is done at runtime. There is an additional
+ * complication in that PROC_WRITE disallows 32 bit consumers
+ * from writing to 64 bit address space targets.
+ */
+#define PROC_READ(w, t, a) wrap32 ? \
+ proc_read_ ## w ## 32(t, a) : \
+ proc_read_ ## w (t, a)
+#define PROC_WRITE(w, t, a) wrap32 ? \
+ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \
+ proc_write_ ## w (t, a)
+#else
+#define PROC_READ(w, t, a) proc_read_ ## w (t, a)
+#define PROC_WRITE(w, t, a) proc_write_ ## w (t, a)
+#endif
+
+int
+kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
+{
+ struct iovec iov;
+ struct uio uio;
+ struct proc *curp, *p, *pp;
+ struct thread *td2 = NULL, *td3;
+ struct ptrace_io_desc *piod = NULL;
+ struct ptrace_lwpinfo *pl;
+ int error, write, tmp, num;
+ int proctree_locked = 0;
+ lwpid_t tid = 0, *buf;
+#ifdef COMPAT_FREEBSD32
+ int wrap32 = 0, safe = 0;
+ struct ptrace_io_desc32 *piod32 = NULL;
+ struct ptrace_lwpinfo32 *pl32 = NULL;
+ struct ptrace_lwpinfo plr;
+#endif
+
+ curp = td->td_proc;
+
+ /* Lock proctree before locking the process. */
+ switch (req) {
+ case PT_TRACE_ME:
+ case PT_ATTACH:
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ case PT_FOLLOW_FORK:
+ case PT_DETACH:
+ sx_xlock(&proctree_lock);
+ proctree_locked = 1;
+ break;
+ default:
+ break;
+ }
+
+ write = 0;
+ if (req == PT_TRACE_ME) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ if (pid <= PID_MAX) {
+ if ((p = pfind(pid)) == NULL) {
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ } else {
+ td2 = tdfind(pid, -1);
+ if (td2 == NULL) {
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ p = td2->td_proc;
+ tid = pid;
+ pid = p->p_pid;
+ }
+ }
+ AUDIT_ARG_PROCESS(p);
+
+ if ((p->p_flag & P_WEXIT) != 0) {
+ error = ESRCH;
+ goto fail;
+ }
+ if ((error = p_cansee(td, p)) != 0)
+ goto fail;
+
+ if ((error = p_candebug(td, p)) != 0)
+ goto fail;
+
+ /*
+ * System processes can't be debugged.
+ */
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ if (tid == 0) {
+ if ((p->p_flag & P_STOPPED_TRACE) != 0) {
+ KASSERT(p->p_xthread != NULL, ("NULL p_xthread"));
+ td2 = p->p_xthread;
+ } else {
+ td2 = FIRST_THREAD_IN_PROC(p);
+ }
+ tid = td2->td_tid;
+ }
+
+#ifdef COMPAT_FREEBSD32
+ /*
+ * Test if we're a 32 bit client and what the target is.
+ * Set the wrap controls accordingly.
+ */
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ if (SV_PROC_FLAG(td2->td_proc, SV_ILP32))
+ safe = 1;
+ wrap32 = 1;
+ }
+#endif
+ /*
+ * Permissions check
+ */
+ switch (req) {
+ case PT_TRACE_ME:
+ /* Always legal. */
+ break;
+
+ case PT_ATTACH:
+ /* Self */
+ if (p->p_pid == td->td_proc->p_pid) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ /* Already traced */
+ if (p->p_flag & P_TRACED) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* Can't trace an ancestor if you're being traced. */
+ if (curp->p_flag & P_TRACED) {
+ for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
+ if (pp == p) {
+ error = EINVAL;
+ goto fail;
+ }
+ }
+ }
+
+
+ /* OK */
+ break;
+
+ case PT_CLEARSTEP:
+ /* Allow thread to clear single step for itself */
+ if (td->td_tid == tid)
+ break;
+
+ /* FALLTHROUGH */
+ default:
+ /* not being traced... */
+ if ((p->p_flag & P_TRACED) == 0) {
+ error = EPERM;
+ goto fail;
+ }
+
+ /* not being traced by YOU */
+ if (p->p_pptr != td->td_proc) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* not currently stopped */
+ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 ||
+ p->p_suspcount != p->p_numthreads ||
+ (p->p_flag & P_WAITED) == 0) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ if ((p->p_flag & P_STOPPED_TRACE) == 0) {
+ static int count = 0;
+ if (count++ == 0)
+ printf("P_STOPPED_TRACE not set.\n");
+ }
+
+ /* OK */
+ break;
+ }
+
+ /* Keep this process around until we finish this request. */
+ _PHOLD(p);
+
+#ifdef FIX_SSTEP
+ /*
+ * Single step fixup ala procfs
+ */
+ FIX_SSTEP(td2);
+#endif
+
+ /*
+ * Actually do the requests
+ */
+
+ td->td_retval[0] = 0;
+
+ switch (req) {
+ case PT_TRACE_ME:
+ /* set my trace flag and "owner" so it can read/write me */
+ p->p_flag |= P_TRACED;
+ if (p->p_flag & P_PPWAIT)
+ p->p_flag |= P_PPTRACE;
+ p->p_oppid = p->p_pptr->p_pid;
+ break;
+
+ case PT_ATTACH:
+ /* security check done above */
+ /*
+ * It would be nice if the tracing relationship was separate
+ * from the parent relationship but that would require
+ * another set of links in the proc struct or for "wait"
+ * to scan the entire proc table. To make life easier,
+ * we just re-parent the process we're trying to trace.
+ * The old parent is remembered so we can put things back
+ * on a "detach".
+ */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ if (p->p_pptr != td->td_proc) {
+ proc_reparent(p, td->td_proc);
+ }
+ data = SIGSTOP;
+ goto sendsig; /* in PT_CONTINUE below */
+
+ case PT_CLEARSTEP:
+ error = ptrace_clear_single_step(td2);
+ break;
+
+ case PT_SETSTEP:
+ error = ptrace_single_step(td2);
+ break;
+
+ case PT_SUSPEND:
+ td2->td_dbgflags |= TDB_SUSPEND;
+ thread_lock(td2);
+ td2->td_flags |= TDF_NEEDSUSPCHK;
+ thread_unlock(td2);
+ break;
+
+ case PT_RESUME:
+ td2->td_dbgflags &= ~TDB_SUSPEND;
+ break;
+
+ case PT_FOLLOW_FORK:
+ if (data)
+ p->p_flag |= P_FOLLOWFORK;
+ else
+ p->p_flag &= ~P_FOLLOWFORK;
+ break;
+
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ case PT_DETACH:
+ /* Zero means do not send any signal */
+ if (data < 0 || data > _SIG_MAXSIG) {
+ error = EINVAL;
+ break;
+ }
+
+ switch (req) {
+ case PT_STEP:
+ error = ptrace_single_step(td2);
+ if (error)
+ goto out;
+ break;
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ if (addr != (void *)1) {
+ error = ptrace_set_pc(td2,
+ (u_long)(uintfptr_t)addr);
+ if (error)
+ goto out;
+ }
+ switch (req) {
+ case PT_TO_SCE:
+ p->p_stops |= S_PT_SCE;
+ break;
+ case PT_TO_SCX:
+ p->p_stops |= S_PT_SCX;
+ break;
+ case PT_SYSCALL:
+ p->p_stops |= S_PT_SCE | S_PT_SCX;
+ break;
+ }
+ break;
+ case PT_DETACH:
+ /* reset process parent */
+ if (p->p_oppid != p->p_pptr->p_pid) {
+ struct proc *pp;
+
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+
+ PROC_UNLOCK(p);
+ pp = pfind(p->p_oppid);
+ if (pp == NULL)
+ pp = initproc;
+ else
+ PROC_UNLOCK(pp);
+ PROC_LOCK(p);
+ proc_reparent(p, pp);
+ if (pp == initproc)
+ p->p_sigparent = SIGCHLD;
+ }
+ p->p_oppid = 0;
+ p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK);
+
+ /* should we send SIGCHLD? */
+ /* childproc_continued(p); */
+ break;
+ }
+
+ sendsig:
+ if (proctree_locked) {
+ sx_xunlock(&proctree_lock);
+ proctree_locked = 0;
+ }
+ p->p_xstat = data;
+ p->p_xthread = NULL;
+ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
+ /* deliver or queue signal */
+ td2->td_dbgflags &= ~TDB_XSIG;
+ td2->td_xsig = data;
+
+ if (req == PT_DETACH) {
+ FOREACH_THREAD_IN_PROC(p, td3)
+ td3->td_dbgflags &= ~TDB_SUSPEND;
+ }
+		/*
+		 * Unsuspend all threads.  To keep a particular thread from
+		 * running, suspend it with PT_SUSPEND before continuing
+		 * the process.
+		 */
+ PROC_SLOCK(p);
+ p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ if (req == PT_ATTACH)
+ kern_psignal(p, data);
+ } else {
+ if (data)
+ kern_psignal(p, data);
+ }
+ break;
+
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ td2->td_dbgflags |= TDB_USERWR;
+ write = 1;
+ /* FALLTHROUGH */
+ case PT_READ_I:
+ case PT_READ_D:
+ PROC_UNLOCK(p);
+ tmp = 0;
+ /* write = 0 set above */
+ iov.iov_base = write ? (caddr_t)&data : (caddr_t)&tmp;
+ iov.iov_len = sizeof(int);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(uintptr_t)addr;
+ uio.uio_resid = sizeof(int);
+ uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_td = td;
+ error = proc_rwmem(p, &uio);
+ if (uio.uio_resid != 0) {
+ /*
+ * XXX proc_rwmem() doesn't currently return ENOSPC,
+ * so I think write() can bogusly return 0.
+ * XXX what happens for short writes? We don't want
+ * to write partial data.
+ * XXX proc_rwmem() returns EPERM for other invalid
+ * addresses. Convert this to EINVAL. Does this
+ * clobber returns of EPERM for other reasons?
+ */
+ if (error == 0 || error == ENOSPC || error == EPERM)
+ error = EINVAL; /* EOF */
+ }
+ if (!write)
+ td->td_retval[0] = tmp;
+ PROC_LOCK(p);
+ break;
+
+ case PT_IO:
+#ifdef COMPAT_FREEBSD32
+ if (wrap32) {
+ piod32 = addr;
+ iov.iov_base = (void *)(uintptr_t)piod32->piod_addr;
+ iov.iov_len = piod32->piod_len;
+ uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs;
+ uio.uio_resid = piod32->piod_len;
+ } else
+#endif
+ {
+ piod = addr;
+ iov.iov_base = piod->piod_addr;
+ iov.iov_len = piod->piod_len;
+ uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs;
+ uio.uio_resid = piod->piod_len;
+ }
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_td = td;
+#ifdef COMPAT_FREEBSD32
+ tmp = wrap32 ? piod32->piod_op : piod->piod_op;
+#else
+ tmp = piod->piod_op;
+#endif
+ switch (tmp) {
+ case PIOD_READ_D:
+ case PIOD_READ_I:
+ uio.uio_rw = UIO_READ;
+ break;
+ case PIOD_WRITE_D:
+ case PIOD_WRITE_I:
+ td2->td_dbgflags |= TDB_USERWR;
+ uio.uio_rw = UIO_WRITE;
+ break;
+ default:
+ error = EINVAL;
+ goto out;
+ }
+ PROC_UNLOCK(p);
+ error = proc_rwmem(p, &uio);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ piod32->piod_len -= uio.uio_resid;
+ else
+#endif
+ piod->piod_len -= uio.uio_resid;
+ PROC_LOCK(p);
+ break;
+
+ case PT_KILL:
+ data = SIGKILL;
+ goto sendsig; /* in PT_CONTINUE above */
+
+ case PT_SETREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(regs, td2, addr);
+ break;
+
+ case PT_GETREGS:
+ error = PROC_READ(regs, td2, addr);
+ break;
+
+ case PT_SETFPREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(fpregs, td2, addr);
+ break;
+
+ case PT_GETFPREGS:
+ error = PROC_READ(fpregs, td2, addr);
+ break;
+
+ case PT_SETDBREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(dbregs, td2, addr);
+ break;
+
+ case PT_GETDBREGS:
+ error = PROC_READ(dbregs, td2, addr);
+ break;
+
+ case PT_LWPINFO:
+ if (data <= 0 ||
+#ifdef COMPAT_FREEBSD32
+ (!wrap32 && data > sizeof(*pl)) ||
+ (wrap32 && data > sizeof(*pl32))) {
+#else
+ data > sizeof(*pl)) {
+#endif
+ error = EINVAL;
+ break;
+ }
+#ifdef COMPAT_FREEBSD32
+ if (wrap32) {
+ pl = &plr;
+ pl32 = addr;
+ } else
+#endif
+ pl = addr;
+ pl->pl_lwpid = td2->td_tid;
+ pl->pl_event = PL_EVENT_NONE;
+ pl->pl_flags = 0;
+ if (td2->td_dbgflags & TDB_XSIG) {
+ pl->pl_event = PL_EVENT_SIGNAL;
+ if (td2->td_dbgksi.ksi_signo != 0 &&
+#ifdef COMPAT_FREEBSD32
+ ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo,
+ pl_siginfo) + sizeof(pl->pl_siginfo)) ||
+ (wrap32 && data >= offsetof(struct ptrace_lwpinfo32,
+ pl_siginfo) + sizeof(struct siginfo32)))
+#else
+ data >= offsetof(struct ptrace_lwpinfo, pl_siginfo)
+ + sizeof(pl->pl_siginfo)
+#endif
+ ){
+ pl->pl_flags |= PL_FLAG_SI;
+ pl->pl_siginfo = td2->td_dbgksi.ksi_info;
+ }
+ }
+ if ((pl->pl_flags & PL_FLAG_SI) == 0)
+ bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo));
+ if (td2->td_dbgflags & TDB_SCE)
+ pl->pl_flags |= PL_FLAG_SCE;
+ else if (td2->td_dbgflags & TDB_SCX)
+ pl->pl_flags |= PL_FLAG_SCX;
+ if (td2->td_dbgflags & TDB_EXEC)
+ pl->pl_flags |= PL_FLAG_EXEC;
+ if (td2->td_dbgflags & TDB_FORK) {
+ pl->pl_flags |= PL_FLAG_FORKED;
+ pl->pl_child_pid = td2->td_dbg_forked;
+ }
+ if (td2->td_dbgflags & TDB_CHILD)
+ pl->pl_flags |= PL_FLAG_CHILD;
+ pl->pl_sigmask = td2->td_sigmask;
+ pl->pl_siglist = td2->td_siglist;
+ strcpy(pl->pl_tdname, td2->td_name);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ ptrace_lwpinfo_to32(pl, pl32);
+#endif
+ break;
+
+ case PT_GETNUMLWPS:
+ td->td_retval[0] = p->p_numthreads;
+ break;
+
+ case PT_GETLWPLIST:
+ if (data <= 0) {
+ error = EINVAL;
+ break;
+ }
+ num = imin(p->p_numthreads, data);
+ PROC_UNLOCK(p);
+ buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
+ tmp = 0;
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ if (tmp >= num)
+ break;
+ buf[tmp++] = td2->td_tid;
+ }
+ PROC_UNLOCK(p);
+ error = copyout(buf, addr, tmp * sizeof(lwpid_t));
+ free(buf, M_TEMP);
+ if (!error)
+ td->td_retval[0] = tmp;
+ PROC_LOCK(p);
+ break;
+
+ case PT_VM_TIMESTAMP:
+ td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
+ break;
+
+ case PT_VM_ENTRY:
+ PROC_UNLOCK(p);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ error = ptrace_vm_entry32(td, p, addr);
+ else
+#endif
+ error = ptrace_vm_entry(td, p, addr);
+ PROC_LOCK(p);
+ break;
+
+ default:
+#ifdef __HAVE_PTRACE_MACHDEP
+ if (req >= PT_FIRSTMACH) {
+ PROC_UNLOCK(p);
+ error = cpu_ptrace(td2, req, addr, data);
+ PROC_LOCK(p);
+ } else
+#endif
+ /* Unknown request. */
+ error = EINVAL;
+ break;
+ }
+
+out:
+ /* Drop our hold on this process now that the request has completed. */
+ _PRELE(p);
+fail:
+ PROC_UNLOCK(p);
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (error);
+}
+#undef PROC_READ
+#undef PROC_WRITE
+
+/*
+ * Stop a process because of a debugging event;
+ * stay stopped until p->p_step is cleared
+ * (cleared by PIOCCONT in procfs).
+ */
+void
+stopevent(struct proc *p, unsigned int event, unsigned int val)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_step = 1;
+ do {
+ p->p_xstat = val;
+ p->p_xthread = NULL;
+ p->p_stype = event; /* Which event caused the stop? */
+ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
+ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
+ } while (p->p_step);
+}
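
The request dispatch above is what a userland debugger reaches through ptrace(2). A minimal sketch of the attach/peek/detach sequence serviced by the PT_ATTACH, PT_READ_D and PT_DETACH cases (the pid and address are placeholders; error handling is trimmed):

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <stdio.h>

/*
 * Sketch: attach to a target, wait for the SIGSTOP queued by PT_ATTACH,
 * read one word of its memory via PT_READ_D, then detach with data == 0
 * ("deliver no signal"), which lets the target resume.
 */
static int
peek_word(pid_t pid, caddr_t addr)
{
	int status, word;

	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		return (-1);
	waitpid(pid, &status, 0);	/* target stops with SIGSTOP */

	word = ptrace(PT_READ_D, pid, addr, 0);
	printf("word at %p: %#x\n", (void *)addr, word);

	return (ptrace(PT_DETACH, pid, NULL, 0));
}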
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
new file mode 100644
index 0000000..6a766af
--- /dev/null
+++ b/sys/kern/sys_socket.c
@@ -0,0 +1,297 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/sigio.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/filio.h> /* XXX */
+#include <sys/sockio.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+struct fileops socketops = {
+ .fo_read = soo_read,
+ .fo_write = soo_write,
+ .fo_truncate = soo_truncate,
+ .fo_ioctl = soo_ioctl,
+ .fo_poll = soo_poll,
+ .fo_kqfilter = soo_kqfilter,
+ .fo_stat = soo_stat,
+ .fo_close = soo_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+/* ARGSUSED */
+int
+soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error;
+
+#ifdef MAC
+ error = mac_socket_check_receive(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ error = soreceive(so, 0, uio, 0, 0, 0);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error;
+
+#ifdef MAC
+ error = mac_socket_check_send(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
+ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
+ PROC_LOCK(uio->uio_td->td_proc);
+ tdsignal(uio->uio_td, SIGPIPE);
+ PROC_UNLOCK(uio->uio_td->td_proc);
+ }
+ return (error);
+}
+
+int
+soo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error = 0;
+
+ switch (cmd) {
+ case FIONBIO:
+ SOCK_LOCK(so);
+ if (*(int *)data)
+ so->so_state |= SS_NBIO;
+ else
+ so->so_state &= ~SS_NBIO;
+ SOCK_UNLOCK(so);
+ break;
+
+ case FIOASYNC:
+ /*
+ * XXXRW: This code separately acquires SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
+ * mutex to avoid introducing the assumption that they are
+ * the same.
+ */
+ if (*(int *)data) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_ASYNC;
+ SOCK_UNLOCK(so);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ } else {
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ASYNC;
+ SOCK_UNLOCK(so);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
+ break;
+
+ case FIONREAD:
+ /* Unlocked read. */
+ *(int *)data = so->so_rcv.sb_cc;
+ break;
+
+ case FIONWRITE:
+ /* Unlocked read. */
+ *(int *)data = so->so_snd.sb_cc;
+ break;
+
+ case FIONSPACE:
+ if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
+ (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
+ *(int *)data = 0;
+ else
+ *(int *)data = sbspace(&so->so_snd);
+ break;
+
+ case FIOSETOWN:
+ error = fsetown(*(int *)data, &so->so_sigio);
+ break;
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&so->so_sigio);
+ break;
+
+ case SIOCSPGRP:
+ error = fsetown(-(*(int *)data), &so->so_sigio);
+ break;
+
+ case SIOCGPGRP:
+ *(int *)data = -fgetown(&so->so_sigio);
+ break;
+
+ case SIOCATMARK:
+ /* Unlocked read. */
+ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
+ break;
+ default:
+ /*
+ * Interface/routing/protocol specific ioctls: interface and
+ * routing ioctls should have a different entry since a
+ * socket is unnecessary.
+ */
+ if (IOCGROUP(cmd) == 'i')
+ error = ifioctl(so, cmd, data, td);
+ else if (IOCGROUP(cmd) == 'r') {
+ CURVNET_SET(so->so_vnet);
+ error = rtioctl_fib(cmd, data, so->so_fibnum);
+ CURVNET_RESTORE();
+ } else {
+ CURVNET_SET(so->so_vnet);
+ error = ((*so->so_proto->pr_usrreqs->pru_control)
+ (so, cmd, data, 0, td));
+ CURVNET_RESTORE();
+ }
+ break;
+ }
+ return (error);
+}
+
+int
+soo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+#ifdef MAC
+ int error;
+
+ error = mac_socket_check_poll(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ return (sopoll(so, events, fp->f_cred, td));
+}
+
+int
+soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+#ifdef MAC
+ int error;
+#endif
+
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFSOCK;
+#ifdef MAC
+ error = mac_socket_check_stat(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ /*
+ * If SBS_CANTRCVMORE is set, but there's still data left in the
+ * receive buffer, the socket is still readable.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 ||
+ so->so_rcv.sb_cc != 0)
+ ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+ ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /* Unlocked read. */
+ if ((so->so_snd.sb_state & SBS_CANTSENDMORE) == 0)
+ ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+ ub->st_uid = so->so_cred->cr_uid;
+ ub->st_gid = so->so_cred->cr_gid;
+ return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
+}
+
+/*
+ * API socket close on file pointer. We call soclose() to close the socket
+ * (including initiating closing protocols). soclose() will sorele() the
+ * file reference but the actual socket will not go away until the socket's
+ * ref count hits 0.
+ */
+/* ARGSUSED */
+int
+soo_close(struct file *fp, struct thread *td)
+{
+ int error = 0;
+ struct socket *so;
+
+ so = fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+
+ if (so)
+ error = soclose(so);
+ return (error);
+}
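
The FIONBIO and FIONREAD branches of soo_ioctl() above are what back the usual "non-blocking" and "bytes pending" ioctls on a socket descriptor. A minimal userland sketch (s is assumed to be an already-created socket; error handling is trimmed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdio.h>

/*
 * Sketch: mark a socket non-blocking (soo_ioctl() sets SS_NBIO) and ask
 * how many bytes are sitting in its receive buffer (so_rcv.sb_cc).
 */
static int
socket_ioctl_demo(int s)
{
	int on = 1, pending;

	if (ioctl(s, FIONBIO, &on) == -1)
		return (-1);
	if (ioctl(s, FIONREAD, &pending) == -1)
		return (-1);
	printf("%d byte(s) waiting in the receive buffer\n", pending);
	return (0);
}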
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
new file mode 100644
index 0000000..f330879
--- /dev/null
+++ b/sys/kern/syscalls.c
@@ -0,0 +1,554 @@
+/*
+ * System call names.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/kern/syscalls.master 255219 2013-09-05 00:09:56Z pjd
+ */
+
+const char *syscallnames[] = {
+ "syscall", /* 0 = syscall */
+ "exit", /* 1 = exit */
+ "fork", /* 2 = fork */
+ "read", /* 3 = read */
+ "write", /* 4 = write */
+ "open", /* 5 = open */
+ "close", /* 6 = close */
+ "wait4", /* 7 = wait4 */
+ "compat.creat", /* 8 = old creat */
+ "link", /* 9 = link */
+ "unlink", /* 10 = unlink */
+ "obs_execv", /* 11 = obsolete execv */
+ "chdir", /* 12 = chdir */
+ "fchdir", /* 13 = fchdir */
+ "mknod", /* 14 = mknod */
+ "chmod", /* 15 = chmod */
+ "chown", /* 16 = chown */
+ "break", /* 17 = break */
+ "compat4.getfsstat", /* 18 = freebsd4 getfsstat */
+ "compat.lseek", /* 19 = old lseek */
+ "getpid", /* 20 = getpid */
+ "mount", /* 21 = mount */
+ "unmount", /* 22 = unmount */
+ "setuid", /* 23 = setuid */
+ "getuid", /* 24 = getuid */
+ "geteuid", /* 25 = geteuid */
+ "ptrace", /* 26 = ptrace */
+ "recvmsg", /* 27 = recvmsg */
+ "sendmsg", /* 28 = sendmsg */
+ "recvfrom", /* 29 = recvfrom */
+ "accept", /* 30 = accept */
+ "getpeername", /* 31 = getpeername */
+ "getsockname", /* 32 = getsockname */
+ "access", /* 33 = access */
+ "chflags", /* 34 = chflags */
+ "fchflags", /* 35 = fchflags */
+ "sync", /* 36 = sync */
+ "kill", /* 37 = kill */
+ "compat.stat", /* 38 = old stat */
+ "getppid", /* 39 = getppid */
+ "compat.lstat", /* 40 = old lstat */
+ "dup", /* 41 = dup */
+ "pipe", /* 42 = pipe */
+ "getegid", /* 43 = getegid */
+ "profil", /* 44 = profil */
+ "ktrace", /* 45 = ktrace */
+ "compat.sigaction", /* 46 = old sigaction */
+ "getgid", /* 47 = getgid */
+ "compat.sigprocmask", /* 48 = old sigprocmask */
+ "getlogin", /* 49 = getlogin */
+ "setlogin", /* 50 = setlogin */
+ "acct", /* 51 = acct */
+ "compat.sigpending", /* 52 = old sigpending */
+ "sigaltstack", /* 53 = sigaltstack */
+ "ioctl", /* 54 = ioctl */
+ "reboot", /* 55 = reboot */
+ "revoke", /* 56 = revoke */
+ "symlink", /* 57 = symlink */
+ "readlink", /* 58 = readlink */
+ "execve", /* 59 = execve */
+ "umask", /* 60 = umask */
+ "chroot", /* 61 = chroot */
+ "compat.fstat", /* 62 = old fstat */
+ "compat.getkerninfo", /* 63 = old getkerninfo */
+ "compat.getpagesize", /* 64 = old getpagesize */
+ "msync", /* 65 = msync */
+ "vfork", /* 66 = vfork */
+ "obs_vread", /* 67 = obsolete vread */
+ "obs_vwrite", /* 68 = obsolete vwrite */
+ "sbrk", /* 69 = sbrk */
+ "sstk", /* 70 = sstk */
+ "compat.mmap", /* 71 = old mmap */
+ "vadvise", /* 72 = vadvise */
+ "munmap", /* 73 = munmap */
+ "mprotect", /* 74 = mprotect */
+ "madvise", /* 75 = madvise */
+ "obs_vhangup", /* 76 = obsolete vhangup */
+ "obs_vlimit", /* 77 = obsolete vlimit */
+ "mincore", /* 78 = mincore */
+ "getgroups", /* 79 = getgroups */
+ "setgroups", /* 80 = setgroups */
+ "getpgrp", /* 81 = getpgrp */
+ "setpgid", /* 82 = setpgid */
+ "setitimer", /* 83 = setitimer */
+ "compat.wait", /* 84 = old wait */
+ "swapon", /* 85 = swapon */
+ "getitimer", /* 86 = getitimer */
+ "compat.gethostname", /* 87 = old gethostname */
+ "compat.sethostname", /* 88 = old sethostname */
+ "getdtablesize", /* 89 = getdtablesize */
+ "dup2", /* 90 = dup2 */
+ "#91", /* 91 = getdopt */
+ "fcntl", /* 92 = fcntl */
+ "select", /* 93 = select */
+ "#94", /* 94 = setdopt */
+ "fsync", /* 95 = fsync */
+ "setpriority", /* 96 = setpriority */
+ "socket", /* 97 = socket */
+ "connect", /* 98 = connect */
+ "compat.accept", /* 99 = old accept */
+ "getpriority", /* 100 = getpriority */
+ "compat.send", /* 101 = old send */
+ "compat.recv", /* 102 = old recv */
+ "compat.sigreturn", /* 103 = old sigreturn */
+ "bind", /* 104 = bind */
+ "setsockopt", /* 105 = setsockopt */
+ "listen", /* 106 = listen */
+ "obs_vtimes", /* 107 = obsolete vtimes */
+ "compat.sigvec", /* 108 = old sigvec */
+ "compat.sigblock", /* 109 = old sigblock */
+ "compat.sigsetmask", /* 110 = old sigsetmask */
+ "compat.sigsuspend", /* 111 = old sigsuspend */
+ "compat.sigstack", /* 112 = old sigstack */
+ "compat.recvmsg", /* 113 = old recvmsg */
+ "compat.sendmsg", /* 114 = old sendmsg */
+ "obs_vtrace", /* 115 = obsolete vtrace */
+ "gettimeofday", /* 116 = gettimeofday */
+ "getrusage", /* 117 = getrusage */
+ "getsockopt", /* 118 = getsockopt */
+ "#119", /* 119 = resuba */
+ "readv", /* 120 = readv */
+ "writev", /* 121 = writev */
+ "settimeofday", /* 122 = settimeofday */
+ "fchown", /* 123 = fchown */
+ "fchmod", /* 124 = fchmod */
+ "compat.recvfrom", /* 125 = old recvfrom */
+ "setreuid", /* 126 = setreuid */
+ "setregid", /* 127 = setregid */
+ "rename", /* 128 = rename */
+ "compat.truncate", /* 129 = old truncate */
+ "compat.ftruncate", /* 130 = old ftruncate */
+ "flock", /* 131 = flock */
+ "mkfifo", /* 132 = mkfifo */
+ "sendto", /* 133 = sendto */
+ "shutdown", /* 134 = shutdown */
+ "socketpair", /* 135 = socketpair */
+ "mkdir", /* 136 = mkdir */
+ "rmdir", /* 137 = rmdir */
+ "utimes", /* 138 = utimes */
+ "obs_4.2", /* 139 = obsolete 4.2 sigreturn */
+ "adjtime", /* 140 = adjtime */
+ "compat.getpeername", /* 141 = old getpeername */
+ "compat.gethostid", /* 142 = old gethostid */
+ "compat.sethostid", /* 143 = old sethostid */
+ "compat.getrlimit", /* 144 = old getrlimit */
+ "compat.setrlimit", /* 145 = old setrlimit */
+ "compat.killpg", /* 146 = old killpg */
+ "setsid", /* 147 = setsid */
+ "quotactl", /* 148 = quotactl */
+ "compat.quota", /* 149 = old quota */
+ "compat.getsockname", /* 150 = old getsockname */
+ "#151", /* 151 = sem_lock */
+ "#152", /* 152 = sem_wakeup */
+ "#153", /* 153 = asyncdaemon */
+ "nlm_syscall", /* 154 = nlm_syscall */
+ "nfssvc", /* 155 = nfssvc */
+ "compat.getdirentries", /* 156 = old getdirentries */
+ "compat4.statfs", /* 157 = freebsd4 statfs */
+ "compat4.fstatfs", /* 158 = freebsd4 fstatfs */
+ "#159", /* 159 = nosys */
+ "lgetfh", /* 160 = lgetfh */
+ "getfh", /* 161 = getfh */
+ "compat4.getdomainname", /* 162 = freebsd4 getdomainname */
+ "compat4.setdomainname", /* 163 = freebsd4 setdomainname */
+ "compat4.uname", /* 164 = freebsd4 uname */
+ "sysarch", /* 165 = sysarch */
+ "rtprio", /* 166 = rtprio */
+ "#167", /* 167 = nosys */
+ "#168", /* 168 = nosys */
+ "semsys", /* 169 = semsys */
+ "msgsys", /* 170 = msgsys */
+ "shmsys", /* 171 = shmsys */
+ "#172", /* 172 = nosys */
+ "freebsd6_pread", /* 173 = freebsd6_pread */
+ "freebsd6_pwrite", /* 174 = freebsd6_pwrite */
+ "setfib", /* 175 = setfib */
+ "ntp_adjtime", /* 176 = ntp_adjtime */
+ "#177", /* 177 = sfork */
+ "#178", /* 178 = getdescriptor */
+ "#179", /* 179 = setdescriptor */
+ "#180", /* 180 = nosys */
+ "setgid", /* 181 = setgid */
+ "setegid", /* 182 = setegid */
+ "seteuid", /* 183 = seteuid */
+ "#184", /* 184 = lfs_bmapv */
+ "#185", /* 185 = lfs_markv */
+ "#186", /* 186 = lfs_segclean */
+ "#187", /* 187 = lfs_segwait */
+ "stat", /* 188 = stat */
+ "fstat", /* 189 = fstat */
+ "lstat", /* 190 = lstat */
+ "pathconf", /* 191 = pathconf */
+ "fpathconf", /* 192 = fpathconf */
+ "#193", /* 193 = nosys */
+ "getrlimit", /* 194 = getrlimit */
+ "setrlimit", /* 195 = setrlimit */
+ "getdirentries", /* 196 = getdirentries */
+ "freebsd6_mmap", /* 197 = freebsd6_mmap */
+ "__syscall", /* 198 = __syscall */
+ "freebsd6_lseek", /* 199 = freebsd6_lseek */
+ "freebsd6_truncate", /* 200 = freebsd6_truncate */
+ "freebsd6_ftruncate", /* 201 = freebsd6_ftruncate */
+ "__sysctl", /* 202 = __sysctl */
+ "mlock", /* 203 = mlock */
+ "munlock", /* 204 = munlock */
+ "undelete", /* 205 = undelete */
+ "futimes", /* 206 = futimes */
+ "getpgid", /* 207 = getpgid */
+ "#208", /* 208 = newreboot */
+ "poll", /* 209 = poll */
+ "lkmnosys", /* 210 = lkmnosys */
+ "lkmnosys", /* 211 = lkmnosys */
+ "lkmnosys", /* 212 = lkmnosys */
+ "lkmnosys", /* 213 = lkmnosys */
+ "lkmnosys", /* 214 = lkmnosys */
+ "lkmnosys", /* 215 = lkmnosys */
+ "lkmnosys", /* 216 = lkmnosys */
+ "lkmnosys", /* 217 = lkmnosys */
+ "lkmnosys", /* 218 = lkmnosys */
+ "lkmnosys", /* 219 = lkmnosys */
+ "compat7.__semctl", /* 220 = freebsd7 __semctl */
+ "semget", /* 221 = semget */
+ "semop", /* 222 = semop */
+ "#223", /* 223 = semconfig */
+ "compat7.msgctl", /* 224 = freebsd7 msgctl */
+ "msgget", /* 225 = msgget */
+ "msgsnd", /* 226 = msgsnd */
+ "msgrcv", /* 227 = msgrcv */
+ "shmat", /* 228 = shmat */
+ "compat7.shmctl", /* 229 = freebsd7 shmctl */
+ "shmdt", /* 230 = shmdt */
+ "shmget", /* 231 = shmget */
+ "clock_gettime", /* 232 = clock_gettime */
+ "clock_settime", /* 233 = clock_settime */
+ "clock_getres", /* 234 = clock_getres */
+ "ktimer_create", /* 235 = ktimer_create */
+ "ktimer_delete", /* 236 = ktimer_delete */
+ "ktimer_settime", /* 237 = ktimer_settime */
+ "ktimer_gettime", /* 238 = ktimer_gettime */
+ "ktimer_getoverrun", /* 239 = ktimer_getoverrun */
+ "nanosleep", /* 240 = nanosleep */
+ "ffclock_getcounter", /* 241 = ffclock_getcounter */
+ "ffclock_setestimate", /* 242 = ffclock_setestimate */
+ "ffclock_getestimate", /* 243 = ffclock_getestimate */
+ "#244", /* 244 = nosys */
+ "#245", /* 245 = nosys */
+ "#246", /* 246 = nosys */
+ "clock_getcpuclockid2", /* 247 = clock_getcpuclockid2 */
+ "ntp_gettime", /* 248 = ntp_gettime */
+ "#249", /* 249 = nosys */
+ "minherit", /* 250 = minherit */
+ "rfork", /* 251 = rfork */
+ "openbsd_poll", /* 252 = openbsd_poll */
+ "issetugid", /* 253 = issetugid */
+ "lchown", /* 254 = lchown */
+ "aio_read", /* 255 = aio_read */
+ "aio_write", /* 256 = aio_write */
+ "lio_listio", /* 257 = lio_listio */
+ "#258", /* 258 = nosys */
+ "#259", /* 259 = nosys */
+ "#260", /* 260 = nosys */
+ "#261", /* 261 = nosys */
+ "#262", /* 262 = nosys */
+ "#263", /* 263 = nosys */
+ "#264", /* 264 = nosys */
+ "#265", /* 265 = nosys */
+ "#266", /* 266 = nosys */
+ "#267", /* 267 = nosys */
+ "#268", /* 268 = nosys */
+ "#269", /* 269 = nosys */
+ "#270", /* 270 = nosys */
+ "#271", /* 271 = nosys */
+ "getdents", /* 272 = getdents */
+ "#273", /* 273 = nosys */
+ "lchmod", /* 274 = lchmod */
+ "netbsd_lchown", /* 275 = netbsd_lchown */
+ "lutimes", /* 276 = lutimes */
+ "netbsd_msync", /* 277 = netbsd_msync */
+ "nstat", /* 278 = nstat */
+ "nfstat", /* 279 = nfstat */
+ "nlstat", /* 280 = nlstat */
+ "#281", /* 281 = nosys */
+ "#282", /* 282 = nosys */
+ "#283", /* 283 = nosys */
+ "#284", /* 284 = nosys */
+ "#285", /* 285 = nosys */
+ "#286", /* 286 = nosys */
+ "#287", /* 287 = nosys */
+ "#288", /* 288 = nosys */
+ "preadv", /* 289 = preadv */
+ "pwritev", /* 290 = pwritev */
+ "#291", /* 291 = nosys */
+ "#292", /* 292 = nosys */
+ "#293", /* 293 = nosys */
+ "#294", /* 294 = nosys */
+ "#295", /* 295 = nosys */
+ "#296", /* 296 = nosys */
+ "compat4.fhstatfs", /* 297 = freebsd4 fhstatfs */
+ "fhopen", /* 298 = fhopen */
+ "fhstat", /* 299 = fhstat */
+ "modnext", /* 300 = modnext */
+ "modstat", /* 301 = modstat */
+ "modfnext", /* 302 = modfnext */
+ "modfind", /* 303 = modfind */
+ "kldload", /* 304 = kldload */
+ "kldunload", /* 305 = kldunload */
+ "kldfind", /* 306 = kldfind */
+ "kldnext", /* 307 = kldnext */
+ "kldstat", /* 308 = kldstat */
+ "kldfirstmod", /* 309 = kldfirstmod */
+ "getsid", /* 310 = getsid */
+ "setresuid", /* 311 = setresuid */
+ "setresgid", /* 312 = setresgid */
+ "obs_signanosleep", /* 313 = obsolete signanosleep */
+ "aio_return", /* 314 = aio_return */
+ "aio_suspend", /* 315 = aio_suspend */
+ "aio_cancel", /* 316 = aio_cancel */
+ "aio_error", /* 317 = aio_error */
+ "oaio_read", /* 318 = oaio_read */
+ "oaio_write", /* 319 = oaio_write */
+ "olio_listio", /* 320 = olio_listio */
+ "yield", /* 321 = yield */
+ "obs_thr_sleep", /* 322 = obsolete thr_sleep */
+ "obs_thr_wakeup", /* 323 = obsolete thr_wakeup */
+ "mlockall", /* 324 = mlockall */
+ "munlockall", /* 325 = munlockall */
+ "__getcwd", /* 326 = __getcwd */
+ "sched_setparam", /* 327 = sched_setparam */
+ "sched_getparam", /* 328 = sched_getparam */
+ "sched_setscheduler", /* 329 = sched_setscheduler */
+ "sched_getscheduler", /* 330 = sched_getscheduler */
+ "sched_yield", /* 331 = sched_yield */
+ "sched_get_priority_max", /* 332 = sched_get_priority_max */
+ "sched_get_priority_min", /* 333 = sched_get_priority_min */
+ "sched_rr_get_interval", /* 334 = sched_rr_get_interval */
+ "utrace", /* 335 = utrace */
+ "compat4.sendfile", /* 336 = freebsd4 sendfile */
+ "kldsym", /* 337 = kldsym */
+ "jail", /* 338 = jail */
+ "nnpfs_syscall", /* 339 = nnpfs_syscall */
+ "sigprocmask", /* 340 = sigprocmask */
+ "sigsuspend", /* 341 = sigsuspend */
+ "compat4.sigaction", /* 342 = freebsd4 sigaction */
+ "sigpending", /* 343 = sigpending */
+ "compat4.sigreturn", /* 344 = freebsd4 sigreturn */
+ "sigtimedwait", /* 345 = sigtimedwait */
+ "sigwaitinfo", /* 346 = sigwaitinfo */
+ "__acl_get_file", /* 347 = __acl_get_file */
+ "__acl_set_file", /* 348 = __acl_set_file */
+ "__acl_get_fd", /* 349 = __acl_get_fd */
+ "__acl_set_fd", /* 350 = __acl_set_fd */
+ "__acl_delete_file", /* 351 = __acl_delete_file */
+ "__acl_delete_fd", /* 352 = __acl_delete_fd */
+ "__acl_aclcheck_file", /* 353 = __acl_aclcheck_file */
+ "__acl_aclcheck_fd", /* 354 = __acl_aclcheck_fd */
+ "extattrctl", /* 355 = extattrctl */
+ "extattr_set_file", /* 356 = extattr_set_file */
+ "extattr_get_file", /* 357 = extattr_get_file */
+ "extattr_delete_file", /* 358 = extattr_delete_file */
+ "aio_waitcomplete", /* 359 = aio_waitcomplete */
+ "getresuid", /* 360 = getresuid */
+ "getresgid", /* 361 = getresgid */
+ "kqueue", /* 362 = kqueue */
+ "kevent", /* 363 = kevent */
+ "#364", /* 364 = __cap_get_proc */
+ "#365", /* 365 = __cap_set_proc */
+ "#366", /* 366 = __cap_get_fd */
+ "#367", /* 367 = __cap_get_file */
+ "#368", /* 368 = __cap_set_fd */
+ "#369", /* 369 = __cap_set_file */
+ "#370", /* 370 = nosys */
+ "extattr_set_fd", /* 371 = extattr_set_fd */
+ "extattr_get_fd", /* 372 = extattr_get_fd */
+ "extattr_delete_fd", /* 373 = extattr_delete_fd */
+ "__setugid", /* 374 = __setugid */
+ "#375", /* 375 = nfsclnt */
+ "eaccess", /* 376 = eaccess */
+ "afs3_syscall", /* 377 = afs3_syscall */
+ "nmount", /* 378 = nmount */
+ "#379", /* 379 = kse_exit */
+ "#380", /* 380 = kse_wakeup */
+ "#381", /* 381 = kse_create */
+ "#382", /* 382 = kse_thr_interrupt */
+ "#383", /* 383 = kse_release */
+ "__mac_get_proc", /* 384 = __mac_get_proc */
+ "__mac_set_proc", /* 385 = __mac_set_proc */
+ "__mac_get_fd", /* 386 = __mac_get_fd */
+ "__mac_get_file", /* 387 = __mac_get_file */
+ "__mac_set_fd", /* 388 = __mac_set_fd */
+ "__mac_set_file", /* 389 = __mac_set_file */
+ "kenv", /* 390 = kenv */
+ "lchflags", /* 391 = lchflags */
+ "uuidgen", /* 392 = uuidgen */
+ "sendfile", /* 393 = sendfile */
+ "mac_syscall", /* 394 = mac_syscall */
+ "getfsstat", /* 395 = getfsstat */
+ "statfs", /* 396 = statfs */
+ "fstatfs", /* 397 = fstatfs */
+ "fhstatfs", /* 398 = fhstatfs */
+ "#399", /* 399 = nosys */
+ "ksem_close", /* 400 = ksem_close */
+ "ksem_post", /* 401 = ksem_post */
+ "ksem_wait", /* 402 = ksem_wait */
+ "ksem_trywait", /* 403 = ksem_trywait */
+ "ksem_init", /* 404 = ksem_init */
+ "ksem_open", /* 405 = ksem_open */
+ "ksem_unlink", /* 406 = ksem_unlink */
+ "ksem_getvalue", /* 407 = ksem_getvalue */
+ "ksem_destroy", /* 408 = ksem_destroy */
+ "__mac_get_pid", /* 409 = __mac_get_pid */
+ "__mac_get_link", /* 410 = __mac_get_link */
+ "__mac_set_link", /* 411 = __mac_set_link */
+ "extattr_set_link", /* 412 = extattr_set_link */
+ "extattr_get_link", /* 413 = extattr_get_link */
+ "extattr_delete_link", /* 414 = extattr_delete_link */
+ "__mac_execve", /* 415 = __mac_execve */
+ "sigaction", /* 416 = sigaction */
+ "sigreturn", /* 417 = sigreturn */
+ "#418", /* 418 = __xstat */
+ "#419", /* 419 = __xfstat */
+ "#420", /* 420 = __xlstat */
+ "getcontext", /* 421 = getcontext */
+ "setcontext", /* 422 = setcontext */
+ "swapcontext", /* 423 = swapcontext */
+ "swapoff", /* 424 = swapoff */
+ "__acl_get_link", /* 425 = __acl_get_link */
+ "__acl_set_link", /* 426 = __acl_set_link */
+ "__acl_delete_link", /* 427 = __acl_delete_link */
+ "__acl_aclcheck_link", /* 428 = __acl_aclcheck_link */
+ "sigwait", /* 429 = sigwait */
+ "thr_create", /* 430 = thr_create */
+ "thr_exit", /* 431 = thr_exit */
+ "thr_self", /* 432 = thr_self */
+ "thr_kill", /* 433 = thr_kill */
+ "_umtx_lock", /* 434 = _umtx_lock */
+ "_umtx_unlock", /* 435 = _umtx_unlock */
+ "jail_attach", /* 436 = jail_attach */
+ "extattr_list_fd", /* 437 = extattr_list_fd */
+ "extattr_list_file", /* 438 = extattr_list_file */
+ "extattr_list_link", /* 439 = extattr_list_link */
+ "#440", /* 440 = kse_switchin */
+ "ksem_timedwait", /* 441 = ksem_timedwait */
+ "thr_suspend", /* 442 = thr_suspend */
+ "thr_wake", /* 443 = thr_wake */
+ "kldunloadf", /* 444 = kldunloadf */
+ "audit", /* 445 = audit */
+ "auditon", /* 446 = auditon */
+ "getauid", /* 447 = getauid */
+ "setauid", /* 448 = setauid */
+ "getaudit", /* 449 = getaudit */
+ "setaudit", /* 450 = setaudit */
+ "getaudit_addr", /* 451 = getaudit_addr */
+ "setaudit_addr", /* 452 = setaudit_addr */
+ "auditctl", /* 453 = auditctl */
+ "_umtx_op", /* 454 = _umtx_op */
+ "thr_new", /* 455 = thr_new */
+ "sigqueue", /* 456 = sigqueue */
+ "kmq_open", /* 457 = kmq_open */
+ "kmq_setattr", /* 458 = kmq_setattr */
+ "kmq_timedreceive", /* 459 = kmq_timedreceive */
+ "kmq_timedsend", /* 460 = kmq_timedsend */
+ "kmq_notify", /* 461 = kmq_notify */
+ "kmq_unlink", /* 462 = kmq_unlink */
+ "abort2", /* 463 = abort2 */
+ "thr_set_name", /* 464 = thr_set_name */
+ "aio_fsync", /* 465 = aio_fsync */
+ "rtprio_thread", /* 466 = rtprio_thread */
+ "#467", /* 467 = nosys */
+ "#468", /* 468 = nosys */
+ "#469", /* 469 = __getpath_fromfd */
+ "#470", /* 470 = __getpath_fromaddr */
+ "sctp_peeloff", /* 471 = sctp_peeloff */
+ "sctp_generic_sendmsg", /* 472 = sctp_generic_sendmsg */
+ "sctp_generic_sendmsg_iov", /* 473 = sctp_generic_sendmsg_iov */
+ "sctp_generic_recvmsg", /* 474 = sctp_generic_recvmsg */
+ "pread", /* 475 = pread */
+ "pwrite", /* 476 = pwrite */
+ "mmap", /* 477 = mmap */
+ "lseek", /* 478 = lseek */
+ "truncate", /* 479 = truncate */
+ "ftruncate", /* 480 = ftruncate */
+ "thr_kill2", /* 481 = thr_kill2 */
+ "shm_open", /* 482 = shm_open */
+ "shm_unlink", /* 483 = shm_unlink */
+ "cpuset", /* 484 = cpuset */
+ "cpuset_setid", /* 485 = cpuset_setid */
+ "cpuset_getid", /* 486 = cpuset_getid */
+ "cpuset_getaffinity", /* 487 = cpuset_getaffinity */
+ "cpuset_setaffinity", /* 488 = cpuset_setaffinity */
+ "faccessat", /* 489 = faccessat */
+ "fchmodat", /* 490 = fchmodat */
+ "fchownat", /* 491 = fchownat */
+ "fexecve", /* 492 = fexecve */
+ "fstatat", /* 493 = fstatat */
+ "futimesat", /* 494 = futimesat */
+ "linkat", /* 495 = linkat */
+ "mkdirat", /* 496 = mkdirat */
+ "mkfifoat", /* 497 = mkfifoat */
+ "mknodat", /* 498 = mknodat */
+ "openat", /* 499 = openat */
+ "readlinkat", /* 500 = readlinkat */
+ "renameat", /* 501 = renameat */
+ "symlinkat", /* 502 = symlinkat */
+ "unlinkat", /* 503 = unlinkat */
+ "posix_openpt", /* 504 = posix_openpt */
+ "gssd_syscall", /* 505 = gssd_syscall */
+ "jail_get", /* 506 = jail_get */
+ "jail_set", /* 507 = jail_set */
+ "jail_remove", /* 508 = jail_remove */
+ "closefrom", /* 509 = closefrom */
+ "__semctl", /* 510 = __semctl */
+ "msgctl", /* 511 = msgctl */
+ "shmctl", /* 512 = shmctl */
+ "lpathconf", /* 513 = lpathconf */
+ "obs_cap_new", /* 514 = obsolete cap_new */
+ "__cap_rights_get", /* 515 = __cap_rights_get */
+ "cap_enter", /* 516 = cap_enter */
+ "cap_getmode", /* 517 = cap_getmode */
+ "pdfork", /* 518 = pdfork */
+ "pdkill", /* 519 = pdkill */
+ "pdgetpid", /* 520 = pdgetpid */
+ "#521", /* 521 = pdwait4 */
+ "pselect", /* 522 = pselect */
+ "getloginclass", /* 523 = getloginclass */
+ "setloginclass", /* 524 = setloginclass */
+ "rctl_get_racct", /* 525 = rctl_get_racct */
+ "rctl_get_rules", /* 526 = rctl_get_rules */
+ "rctl_get_limits", /* 527 = rctl_get_limits */
+ "rctl_add_rule", /* 528 = rctl_add_rule */
+ "rctl_remove_rule", /* 529 = rctl_remove_rule */
+ "posix_fallocate", /* 530 = posix_fallocate */
+ "posix_fadvise", /* 531 = posix_fadvise */
+ "wait6", /* 532 = wait6 */
+ "cap_rights_limit", /* 533 = cap_rights_limit */
+ "cap_ioctls_limit", /* 534 = cap_ioctls_limit */
+ "cap_ioctls_get", /* 535 = cap_ioctls_get */
+ "cap_fcntls_limit", /* 536 = cap_fcntls_limit */
+ "cap_fcntls_get", /* 537 = cap_fcntls_get */
+ "bindat", /* 538 = bindat */
+ "connectat", /* 539 = connectat */
+ "chflagsat", /* 540 = chflagsat */
+ "accept4", /* 541 = accept4 */
+ "pipe2", /* 542 = pipe2 */
+ "aio_mlock", /* 543 = aio_mlock */
+};
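
syscallnames[] is a plain number-to-name table; in-kernel tracing code typically indexes it by syscall number. A hedged sketch of the usual bounds-checked lookup, where SYS_MAXSYSCALL comes from the generated sys/syscall.h:

#include <sys/types.h>
#include <sys/syscall.h>

extern const char *syscallnames[];

/*
 * Sketch: map a syscall number to the name recorded in the generated
 * table, falling back to a placeholder for out-of-range codes.
 */
static const char *
syscall_name(u_int code)
{
	return (code < SYS_MAXSYSCALL ? syscallnames[code] : "unknown");
}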
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
new file mode 100644
index 0000000..e19e310
--- /dev/null
+++ b/sys/kern/syscalls.master
@@ -0,0 +1,982 @@
+ $FreeBSD$
+; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
+;
+; System call name/number master file.
+; Processed to create init_sysent.c, syscalls.c and syscall.h.
+
+; Columns: number audit type name alt{name,tag,rtyp}/comments
+; number system call number, must be in order
+; audit the audit event associated with the system call
+; A value of AUE_NULL means no auditing, but it also means that
+; there is no audit event for the call at this time. For the
+; case where the event exists, but we don't want auditing, the
+; event should be #defined to AUE_NULL in audit_kevents.h.
+; type one of STD, OBSOL, UNIMPL, COMPAT, COMPAT4, COMPAT6,
+; COMPAT7, NODEF, NOARGS, NOPROTO, NOSTD
+; The COMPAT* options may be combined with one or more NO*
+; options separated by '|' with no spaces (e.g. COMPAT|NOARGS)
+; name	pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
+; types:
+; STD always included
+; COMPAT included on COMPAT #ifdef
+; COMPAT4 included on COMPAT4 #ifdef (FreeBSD 4 compat)
+; COMPAT6 included on COMPAT6 #ifdef (FreeBSD 6 compat)
+; COMPAT7 included on COMPAT7 #ifdef (FreeBSD 7 compat)
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+; NOSTD	implemented but as an lkm that can be statically
+; compiled in; sysent entry will be filled with lkmressys
+; so the SYSCALL_MODULE macro works
+; NOARGS same as STD except do not create structure in sys/sysproto.h
+; NODEF same as STD except only have the entry in the syscall table
+; added. Meaning - do not create structure or function
+; prototype in sys/sysproto.h
+; NOPROTO same as STD except do not create structure or
+; function prototype in sys/sysproto.h. Does add a
+; definition to syscall.h besides adding a sysent.
+; NOTSTATIC syscall is loadable
+;
+; Please copy any additions and changes to the following compatibility tables:
+; sys/compat/freebsd32/syscalls.master
+
+; #ifdef's, etc. may be included, and are copied to the output files.
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+; Reserved/unimplemented system calls in the range 0-150 inclusive
+; are reserved for use in future Berkeley releases.
+; Additional system calls implemented in vendor and other
+; redistributions should be placed in the reserved range at the end
+; of the current calls.
+
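+; As an illustration: an STD entry below (e.g. the read(2) line) is expanded
+; by makesyscalls.sh into, roughly, a sys_read() prototype and a
+; "struct read_args" definition in sys/sysproto.h, a sysent[] slot in
+; init_sysent.c, the "read" string in syscalls.c and the SYS_read number
+; in syscall.h.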
+0 AUE_NULL STD { int nosys(void); } syscall nosys_args int
+1 AUE_EXIT STD { void sys_exit(int rval); } exit \
+ sys_exit_args void
+2 AUE_FORK STD { int fork(void); }
+3 AUE_NULL STD { ssize_t read(int fd, void *buf, \
+ size_t nbyte); }
+4 AUE_NULL STD { ssize_t write(int fd, const void *buf, \
+ size_t nbyte); }
+5 AUE_OPEN_RWTC STD { int open(char *path, int flags, int mode); }
+; XXX should be { int open(const char *path, int flags, ...); }
+; but we're not ready for `const' or varargs.
+; XXX man page says `mode_t mode'.
+6 AUE_CLOSE STD { int close(int fd); }
+7 AUE_WAIT4 STD { int wait4(int pid, int *status, \
+ int options, struct rusage *rusage); }
+8 AUE_CREAT COMPAT { int creat(char *path, int mode); }
+9 AUE_LINK STD { int link(char *path, char *link); }
+10 AUE_UNLINK STD { int unlink(char *path); }
+11 AUE_NULL OBSOL execv
+12 AUE_CHDIR STD { int chdir(char *path); }
+13 AUE_FCHDIR STD { int fchdir(int fd); }
+14 AUE_MKNOD STD { int mknod(char *path, int mode, int dev); }
+15 AUE_CHMOD STD { int chmod(char *path, int mode); }
+16 AUE_CHOWN STD { int chown(char *path, int uid, int gid); }
+17 AUE_NULL STD { int obreak(char *nsize); } break \
+ obreak_args int
+18 AUE_GETFSSTAT COMPAT4 { int getfsstat(struct ostatfs *buf, \
+ long bufsize, int flags); }
+19 AUE_LSEEK COMPAT { long lseek(int fd, long offset, \
+ int whence); }
+20 AUE_GETPID STD { pid_t getpid(void); }
+21 AUE_MOUNT STD { int mount(char *type, char *path, \
+ int flags, caddr_t data); }
+; XXX `path' should have type `const char *' but we're not ready for that.
+22 AUE_UMOUNT STD { int unmount(char *path, int flags); }
+23 AUE_SETUID STD { int setuid(uid_t uid); }
+24 AUE_GETUID STD { uid_t getuid(void); }
+25 AUE_GETEUID STD { uid_t geteuid(void); }
+26 AUE_PTRACE STD { int ptrace(int req, pid_t pid, \
+ caddr_t addr, int data); }
+27 AUE_RECVMSG STD { int recvmsg(int s, struct msghdr *msg, \
+ int flags); }
+28 AUE_SENDMSG STD { int sendmsg(int s, struct msghdr *msg, \
+ int flags); }
+29 AUE_RECVFROM STD { int recvfrom(int s, caddr_t buf, \
+ size_t len, int flags, \
+ struct sockaddr * __restrict from, \
+ __socklen_t * __restrict fromlenaddr); }
+30 AUE_ACCEPT STD { int accept(int s, \
+ struct sockaddr * __restrict name, \
+ __socklen_t * __restrict anamelen); }
+31 AUE_GETPEERNAME STD { int getpeername(int fdes, \
+ struct sockaddr * __restrict asa, \
+ __socklen_t * __restrict alen); }
+32 AUE_GETSOCKNAME STD { int getsockname(int fdes, \
+ struct sockaddr * __restrict asa, \
+ __socklen_t * __restrict alen); }
+33 AUE_ACCESS STD { int access(char *path, int amode); }
+34 AUE_CHFLAGS STD { int chflags(const char *path, u_long flags); }
+35 AUE_FCHFLAGS STD { int fchflags(int fd, u_long flags); }
+36 AUE_SYNC STD { int sync(void); }
+37 AUE_KILL STD { int kill(int pid, int signum); }
+38 AUE_STAT COMPAT { int stat(char *path, struct ostat *ub); }
+39 AUE_GETPPID STD { pid_t getppid(void); }
+40 AUE_LSTAT COMPAT { int lstat(char *path, struct ostat *ub); }
+41 AUE_DUP STD { int dup(u_int fd); }
+42 AUE_PIPE STD { int pipe(void); }
+43 AUE_GETEGID STD { gid_t getegid(void); }
+44 AUE_PROFILE STD { int profil(caddr_t samples, size_t size, \
+ size_t offset, u_int scale); }
+45 AUE_KTRACE STD { int ktrace(const char *fname, int ops, \
+ int facs, int pid); }
+46 AUE_SIGACTION COMPAT { int sigaction(int signum, \
+ struct osigaction *nsa, \
+ struct osigaction *osa); }
+47 AUE_GETGID STD { gid_t getgid(void); }
+48 AUE_SIGPROCMASK COMPAT { int sigprocmask(int how, osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it, and we return the old mask as the
+; (int) return value.
+49 AUE_GETLOGIN STD { int getlogin(char *namebuf, u_int \
+ namelen); }
+50 AUE_SETLOGIN STD { int setlogin(char *namebuf); }
+51 AUE_ACCT STD { int acct(char *path); }
+52 AUE_SIGPENDING COMPAT { int sigpending(void); }
+53 AUE_SIGALTSTACK STD { int sigaltstack(stack_t *ss, \
+ stack_t *oss); }
+54 AUE_IOCTL STD { int ioctl(int fd, u_long com, \
+ caddr_t data); }
+55 AUE_REBOOT STD { int reboot(int opt); }
+56 AUE_REVOKE STD { int revoke(char *path); }
+57 AUE_SYMLINK STD { int symlink(char *path, char *link); }
+58 AUE_READLINK STD { ssize_t readlink(char *path, char *buf, \
+ size_t count); }
+59 AUE_EXECVE STD { int execve(char *fname, char **argv, \
+ char **envv); }
+60 AUE_UMASK STD { int umask(int newmask); } umask umask_args \
+ int
+61 AUE_CHROOT STD { int chroot(char *path); }
+62 AUE_FSTAT COMPAT { int fstat(int fd, struct ostat *sb); }
+63 AUE_NULL COMPAT { int getkerninfo(int op, char *where, \
+ size_t *size, int arg); } getkerninfo \
+ getkerninfo_args int
+64 AUE_NULL COMPAT { int getpagesize(void); } getpagesize \
+ getpagesize_args int
+65 AUE_MSYNC STD { int msync(void *addr, size_t len, \
+ int flags); }
+66 AUE_VFORK STD { int vfork(void); }
+67 AUE_NULL OBSOL vread
+68 AUE_NULL OBSOL vwrite
+69 AUE_SBRK STD { int sbrk(int incr); }
+70 AUE_SSTK STD { int sstk(int incr); }
+71 AUE_MMAP COMPAT { int mmap(void *addr, int len, int prot, \
+ int flags, int fd, long pos); }
+72 AUE_O_VADVISE STD { int ovadvise(int anom); } vadvise \
+ ovadvise_args int
+73 AUE_MUNMAP STD { int munmap(void *addr, size_t len); }
+74 AUE_MPROTECT STD { int mprotect(const void *addr, size_t len, \
+ int prot); }
+75 AUE_MADVISE STD { int madvise(void *addr, size_t len, \
+ int behav); }
+76 AUE_NULL OBSOL vhangup
+77 AUE_NULL OBSOL vlimit
+78 AUE_MINCORE STD { int mincore(const void *addr, size_t len, \
+ char *vec); }
+79 AUE_GETGROUPS STD { int getgroups(u_int gidsetsize, \
+ gid_t *gidset); }
+80 AUE_SETGROUPS STD { int setgroups(u_int gidsetsize, \
+ gid_t *gidset); }
+81 AUE_GETPGRP STD { int getpgrp(void); }
+82 AUE_SETPGRP STD { int setpgid(int pid, int pgid); }
+83 AUE_SETITIMER STD { int setitimer(u_int which, struct \
+ itimerval *itv, struct itimerval *oitv); }
+84 AUE_WAIT4 COMPAT { int wait(void); }
+85 AUE_SWAPON STD { int swapon(char *name); }
+86 AUE_GETITIMER STD { int getitimer(u_int which, \
+ struct itimerval *itv); }
+87 AUE_SYSCTL COMPAT { int gethostname(char *hostname, \
+ u_int len); } gethostname \
+ gethostname_args int
+88 AUE_SYSCTL COMPAT { int sethostname(char *hostname, \
+ u_int len); } sethostname \
+ sethostname_args int
+89 AUE_GETDTABLESIZE STD { int getdtablesize(void); }
+90 AUE_DUP2 STD { int dup2(u_int from, u_int to); }
+91 AUE_NULL UNIMPL getdopt
+92 AUE_FCNTL STD { int fcntl(int fd, int cmd, long arg); }
+; XXX should be { int fcntl(int fd, int cmd, ...); }
+; but we're not ready for varargs.
+93 AUE_SELECT STD { int select(int nd, fd_set *in, fd_set *ou, \
+ fd_set *ex, struct timeval *tv); }
+94 AUE_NULL UNIMPL setdopt
+95 AUE_FSYNC STD { int fsync(int fd); }
+96 AUE_SETPRIORITY STD { int setpriority(int which, int who, \
+ int prio); }
+97 AUE_SOCKET STD { int socket(int domain, int type, \
+ int protocol); }
+98 AUE_CONNECT STD { int connect(int s, caddr_t name, \
+ int namelen); }
+99 AUE_ACCEPT COMPAT|NOARGS { int accept(int s, caddr_t name, \
+ int *anamelen); } accept accept_args int
+100 AUE_GETPRIORITY STD { int getpriority(int which, int who); }
+101 AUE_SEND COMPAT { int send(int s, caddr_t buf, int len, \
+ int flags); }
+102 AUE_RECV COMPAT { int recv(int s, caddr_t buf, int len, \
+ int flags); }
+103 AUE_SIGRETURN COMPAT { int sigreturn( \
+ struct osigcontext *sigcntxp); }
+104 AUE_BIND STD { int bind(int s, caddr_t name, \
+ int namelen); }
+105 AUE_SETSOCKOPT STD { int setsockopt(int s, int level, int name, \
+ caddr_t val, int valsize); }
+106 AUE_LISTEN STD { int listen(int s, int backlog); }
+107 AUE_NULL OBSOL vtimes
+108 AUE_NULL COMPAT { int sigvec(int signum, struct sigvec *nsv, \
+ struct sigvec *osv); }
+109 AUE_NULL COMPAT { int sigblock(int mask); }
+110 AUE_NULL COMPAT { int sigsetmask(int mask); }
+111 AUE_NULL COMPAT { int sigsuspend(osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it.
+112 AUE_NULL COMPAT { int sigstack(struct sigstack *nss, \
+ struct sigstack *oss); }
+113 AUE_RECVMSG COMPAT { int recvmsg(int s, struct omsghdr *msg, \
+ int flags); }
+114 AUE_SENDMSG COMPAT { int sendmsg(int s, caddr_t msg, \
+ int flags); }
+115 AUE_NULL OBSOL vtrace
+116 AUE_GETTIMEOFDAY STD { int gettimeofday(struct timeval *tp, \
+ struct timezone *tzp); }
+117 AUE_GETRUSAGE STD { int getrusage(int who, \
+ struct rusage *rusage); }
+118 AUE_GETSOCKOPT STD { int getsockopt(int s, int level, int name, \
+ caddr_t val, int *avalsize); }
+119 AUE_NULL UNIMPL resuba (BSD/OS 2.x)
+120 AUE_READV STD { int readv(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+121 AUE_WRITEV STD { int writev(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+122 AUE_SETTIMEOFDAY STD { int settimeofday(struct timeval *tv, \
+ struct timezone *tzp); }
+123 AUE_FCHOWN STD { int fchown(int fd, int uid, int gid); }
+124 AUE_FCHMOD STD { int fchmod(int fd, int mode); }
+125 AUE_RECVFROM COMPAT|NOARGS { int recvfrom(int s, caddr_t buf, \
+ size_t len, int flags, caddr_t from, int \
+ *fromlenaddr); } recvfrom recvfrom_args \
+ int
+126 AUE_SETREUID STD { int setreuid(int ruid, int euid); }
+127 AUE_SETREGID STD { int setregid(int rgid, int egid); }
+128 AUE_RENAME STD { int rename(char *from, char *to); }
+129 AUE_TRUNCATE COMPAT { int truncate(char *path, long length); }
+130 AUE_FTRUNCATE COMPAT { int ftruncate(int fd, long length); }
+131 AUE_FLOCK STD { int flock(int fd, int how); }
+132 AUE_MKFIFO STD { int mkfifo(char *path, int mode); }
+133 AUE_SENDTO STD { int sendto(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t to, int tolen); }
+134 AUE_SHUTDOWN STD { int shutdown(int s, int how); }
+135 AUE_SOCKETPAIR STD { int socketpair(int domain, int type, \
+ int protocol, int *rsv); }
+136 AUE_MKDIR STD { int mkdir(char *path, int mode); }
+137 AUE_RMDIR STD { int rmdir(char *path); }
+138 AUE_UTIMES STD { int utimes(char *path, \
+ struct timeval *tptr); }
+139 AUE_NULL OBSOL 4.2 sigreturn
+140 AUE_ADJTIME STD { int adjtime(struct timeval *delta, \
+ struct timeval *olddelta); }
+141 AUE_GETPEERNAME COMPAT { int getpeername(int fdes, caddr_t asa, \
+ int *alen); }
+142 AUE_SYSCTL COMPAT { long gethostid(void); }
+143 AUE_SYSCTL COMPAT { int sethostid(long hostid); }
+144 AUE_GETRLIMIT COMPAT { int getrlimit(u_int which, struct \
+ orlimit *rlp); }
+145 AUE_SETRLIMIT COMPAT { int setrlimit(u_int which, \
+ struct orlimit *rlp); }
+146 AUE_KILLPG COMPAT { int killpg(int pgid, int signum); }
+147 AUE_SETSID STD { int setsid(void); }
+148 AUE_QUOTACTL STD { int quotactl(char *path, int cmd, int uid, \
+ caddr_t arg); }
+149 AUE_O_QUOTA COMPAT { int quota(void); }
+150 AUE_GETSOCKNAME COMPAT|NOARGS { int getsockname(int fdec, \
+ caddr_t asa, int *alen); } getsockname \
+ getsockname_args int
+
+; Syscalls 151-180 inclusive are reserved for vendor-specific
+; system calls. (This includes various calls added for compatibility
+; with other Unix variants.)
+; Some of these calls are now supported by BSD...
+151 AUE_NULL UNIMPL sem_lock (BSD/OS 2.x)
+152 AUE_NULL UNIMPL sem_wakeup (BSD/OS 2.x)
+153 AUE_NULL UNIMPL asyncdaemon (BSD/OS 2.x)
+; 154 is initialized by the NLM code, if present.
+154 AUE_NULL NOSTD { int nlm_syscall(int debug_level, int grace_period, int addr_count, char **addrs); }
+; 155 is initialized by the NFS code, if present.
+155 AUE_NFS_SVC NOSTD { int nfssvc(int flag, caddr_t argp); }
+156 AUE_GETDIRENTRIES COMPAT { int getdirentries(int fd, char *buf, \
+ u_int count, long *basep); }
+157 AUE_STATFS COMPAT4 { int statfs(char *path, \
+ struct ostatfs *buf); }
+158 AUE_FSTATFS COMPAT4 { int fstatfs(int fd, \
+ struct ostatfs *buf); }
+159 AUE_NULL UNIMPL nosys
+160 AUE_LGETFH STD { int lgetfh(char *fname, \
+ struct fhandle *fhp); }
+161 AUE_NFS_GETFH STD { int getfh(char *fname, \
+ struct fhandle *fhp); }
+162 AUE_SYSCTL COMPAT4 { int getdomainname(char *domainname, \
+ int len); }
+163 AUE_SYSCTL COMPAT4 { int setdomainname(char *domainname, \
+ int len); }
+164 AUE_NULL COMPAT4 { int uname(struct utsname *name); }
+165 AUE_SYSARCH STD { int sysarch(int op, char *parms); }
+166 AUE_RTPRIO STD { int rtprio(int function, pid_t pid, \
+ struct rtprio *rtp); }
+167 AUE_NULL UNIMPL nosys
+168 AUE_NULL UNIMPL nosys
+169 AUE_SEMSYS NOSTD { int semsys(int which, int a2, int a3, \
+ int a4, int a5); }
+; XXX should be { int semsys(int which, ...); }
+170 AUE_MSGSYS NOSTD { int msgsys(int which, int a2, int a3, \
+ int a4, int a5, int a6); }
+; XXX should be { int msgsys(int which, ...); }
+171 AUE_SHMSYS NOSTD { int shmsys(int which, int a2, int a3, \
+ int a4); }
+; XXX should be { int shmsys(int which, ...); }
+172 AUE_NULL UNIMPL nosys
+173 AUE_PREAD STD { ssize_t freebsd6_pread(int fd, void *buf, \
+ size_t nbyte, int pad, off_t offset); }
+174 AUE_PWRITE STD { ssize_t freebsd6_pwrite(int fd, \
+ const void *buf, \
+ size_t nbyte, int pad, off_t offset); }
+175 AUE_NULL STD { int setfib(int fibnum); }
+176 AUE_NTP_ADJTIME STD { int ntp_adjtime(struct timex *tp); }
+177 AUE_NULL UNIMPL sfork (BSD/OS 2.x)
+178 AUE_NULL UNIMPL getdescriptor (BSD/OS 2.x)
+179 AUE_NULL UNIMPL setdescriptor (BSD/OS 2.x)
+180 AUE_NULL UNIMPL nosys
+
+; Syscalls 181-199 are used by/reserved for BSD
+181 AUE_SETGID STD { int setgid(gid_t gid); }
+182 AUE_SETEGID STD { int setegid(gid_t egid); }
+183 AUE_SETEUID STD { int seteuid(uid_t euid); }
+184 AUE_NULL UNIMPL lfs_bmapv
+185 AUE_NULL UNIMPL lfs_markv
+186 AUE_NULL UNIMPL lfs_segclean
+187 AUE_NULL UNIMPL lfs_segwait
+188 AUE_STAT STD { int stat(char *path, struct stat *ub); }
+189 AUE_FSTAT STD { int fstat(int fd, struct stat *sb); }
+190 AUE_LSTAT STD { int lstat(char *path, struct stat *ub); }
+191 AUE_PATHCONF STD { int pathconf(char *path, int name); }
+192 AUE_FPATHCONF STD { int fpathconf(int fd, int name); }
+193 AUE_NULL UNIMPL nosys
+194 AUE_GETRLIMIT STD { int getrlimit(u_int which, \
+ struct rlimit *rlp); } getrlimit \
+ __getrlimit_args int
+195 AUE_SETRLIMIT STD { int setrlimit(u_int which, \
+ struct rlimit *rlp); } setrlimit \
+ __setrlimit_args int
+196 AUE_GETDIRENTRIES STD { int getdirentries(int fd, char *buf, \
+ u_int count, long *basep); }
+197 AUE_MMAP STD { caddr_t freebsd6_mmap(caddr_t addr, \
+ size_t len, int prot, int flags, int fd, \
+ int pad, off_t pos); }
+198 AUE_NULL NOPROTO { int nosys(void); } __syscall \
+ __syscall_args int
+199 AUE_LSEEK STD { off_t freebsd6_lseek(int fd, int pad, \
+ off_t offset, int whence); }
+200 AUE_TRUNCATE STD { int freebsd6_truncate(char *path, int pad, \
+ off_t length); }
+201 AUE_FTRUNCATE STD { int freebsd6_ftruncate(int fd, int pad, \
+ off_t length); }
+202 AUE_SYSCTL STD { int __sysctl(int *name, u_int namelen, \
+ void *old, size_t *oldlenp, void *new, \
+ size_t newlen); } __sysctl sysctl_args int
+203 AUE_MLOCK STD { int mlock(const void *addr, size_t len); }
+204 AUE_MUNLOCK STD { int munlock(const void *addr, size_t len); }
+205 AUE_UNDELETE STD { int undelete(char *path); }
+206 AUE_FUTIMES STD { int futimes(int fd, struct timeval *tptr); }
+207 AUE_GETPGID STD { int getpgid(pid_t pid); }
+208 AUE_NULL UNIMPL newreboot (NetBSD)
+209 AUE_POLL STD { int poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+
+;
+; The following are reserved for loadable syscalls
+;
+210 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+211 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+212 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+213 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+214 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+215 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+216 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+217 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+218 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+219 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+
+;
+; The following were introduced with NetBSD/4.4Lite-2
+220 AUE_SEMCTL COMPAT7|NOSTD { int __semctl(int semid, int semnum, \
+ int cmd, union semun_old *arg); }
+221 AUE_SEMGET NOSTD { int semget(key_t key, int nsems, \
+ int semflg); }
+222 AUE_SEMOP NOSTD { int semop(int semid, struct sembuf *sops, \
+ size_t nsops); }
+223 AUE_NULL UNIMPL semconfig
+224 AUE_MSGCTL COMPAT7|NOSTD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds_old *buf); }
+225 AUE_MSGGET NOSTD { int msgget(key_t key, int msgflg); }
+226 AUE_MSGSND NOSTD { int msgsnd(int msqid, const void *msgp, \
+ size_t msgsz, int msgflg); }
+227 AUE_MSGRCV NOSTD { int msgrcv(int msqid, void *msgp, \
+ size_t msgsz, long msgtyp, int msgflg); }
+228 AUE_SHMAT NOSTD { int shmat(int shmid, const void *shmaddr, \
+ int shmflg); }
+229 AUE_SHMCTL COMPAT7|NOSTD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds_old *buf); }
+230 AUE_SHMDT NOSTD { int shmdt(const void *shmaddr); }
+231 AUE_SHMGET NOSTD { int shmget(key_t key, size_t size, \
+ int shmflg); }
+;
+232 AUE_NULL STD { int clock_gettime(clockid_t clock_id, \
+ struct timespec *tp); }
+233 AUE_CLOCK_SETTIME STD { int clock_settime( \
+ clockid_t clock_id, \
+ const struct timespec *tp); }
+234 AUE_NULL STD { int clock_getres(clockid_t clock_id, \
+ struct timespec *tp); }
+235 AUE_NULL STD { int ktimer_create(clockid_t clock_id, \
+ struct sigevent *evp, int *timerid); }
+236 AUE_NULL STD { int ktimer_delete(int timerid); }
+237 AUE_NULL STD { int ktimer_settime(int timerid, int flags, \
+ const struct itimerspec *value, \
+ struct itimerspec *ovalue); }
+238 AUE_NULL STD { int ktimer_gettime(int timerid, struct \
+ itimerspec *value); }
+239 AUE_NULL STD { int ktimer_getoverrun(int timerid); }
+240 AUE_NULL STD { int nanosleep(const struct timespec *rqtp, \
+ struct timespec *rmtp); }
+241 AUE_NULL STD { int ffclock_getcounter(ffcounter *ffcount); }
+242 AUE_NULL STD { int ffclock_setestimate( \
+ struct ffclock_estimate *cest); }
+243 AUE_NULL STD { int ffclock_getestimate( \
+ struct ffclock_estimate *cest); }
+244 AUE_NULL UNIMPL nosys
+245 AUE_NULL UNIMPL nosys
+246 AUE_NULL UNIMPL nosys
+247 AUE_NULL STD { int clock_getcpuclockid2(id_t id,\
+ int which, clockid_t *clock_id); }
+248 AUE_NULL STD { int ntp_gettime(struct ntptimeval *ntvp); }
+249 AUE_NULL UNIMPL nosys
+; syscall numbers initially used in OpenBSD
+250 AUE_MINHERIT STD { int minherit(void *addr, size_t len, \
+ int inherit); }
+251 AUE_RFORK STD { int rfork(int flags); }
+252 AUE_POLL STD { int openbsd_poll(struct pollfd *fds, \
+ u_int nfds, int timeout); }
+253 AUE_ISSETUGID STD { int issetugid(void); }
+254 AUE_LCHOWN STD { int lchown(char *path, int uid, int gid); }
+255 AUE_NULL NOSTD { int aio_read(struct aiocb *aiocbp); }
+256 AUE_NULL NOSTD { int aio_write(struct aiocb *aiocbp); }
+257 AUE_NULL NOSTD { int lio_listio(int mode, \
+ struct aiocb * const *acb_list, \
+ int nent, struct sigevent *sig); }
+258 AUE_NULL UNIMPL nosys
+259 AUE_NULL UNIMPL nosys
+260 AUE_NULL UNIMPL nosys
+261 AUE_NULL UNIMPL nosys
+262 AUE_NULL UNIMPL nosys
+263 AUE_NULL UNIMPL nosys
+264 AUE_NULL UNIMPL nosys
+265 AUE_NULL UNIMPL nosys
+266 AUE_NULL UNIMPL nosys
+267 AUE_NULL UNIMPL nosys
+268 AUE_NULL UNIMPL nosys
+269 AUE_NULL UNIMPL nosys
+270 AUE_NULL UNIMPL nosys
+271 AUE_NULL UNIMPL nosys
+272 AUE_O_GETDENTS STD { int getdents(int fd, char *buf, \
+ size_t count); }
+273 AUE_NULL UNIMPL nosys
+274 AUE_LCHMOD STD { int lchmod(char *path, mode_t mode); }
+275 AUE_LCHOWN NOPROTO { int lchown(char *path, uid_t uid, \
+ gid_t gid); } netbsd_lchown lchown_args \
+ int
+276 AUE_LUTIMES STD { int lutimes(char *path, \
+ struct timeval *tptr); }
+277 AUE_MSYNC NOPROTO { int msync(void *addr, size_t len, \
+ int flags); } netbsd_msync msync_args int
+278 AUE_STAT STD { int nstat(char *path, struct nstat *ub); }
+279 AUE_FSTAT STD { int nfstat(int fd, struct nstat *sb); }
+280 AUE_LSTAT STD { int nlstat(char *path, struct nstat *ub); }
+281 AUE_NULL UNIMPL nosys
+282 AUE_NULL UNIMPL nosys
+283 AUE_NULL UNIMPL nosys
+284 AUE_NULL UNIMPL nosys
+285 AUE_NULL UNIMPL nosys
+286 AUE_NULL UNIMPL nosys
+287 AUE_NULL UNIMPL nosys
+288 AUE_NULL UNIMPL nosys
+; 289 and 290 from NetBSD (OpenBSD: 267 and 268)
+289 AUE_PREADV STD { ssize_t preadv(int fd, struct iovec *iovp, \
+ u_int iovcnt, off_t offset); }
+290 AUE_PWRITEV STD { ssize_t pwritev(int fd, struct iovec *iovp, \
+ u_int iovcnt, off_t offset); }
+291 AUE_NULL UNIMPL nosys
+292 AUE_NULL UNIMPL nosys
+293 AUE_NULL UNIMPL nosys
+294 AUE_NULL UNIMPL nosys
+295 AUE_NULL UNIMPL nosys
+296 AUE_NULL UNIMPL nosys
+; XXX 297 is 300 in NetBSD
+297 AUE_FHSTATFS COMPAT4 { int fhstatfs( \
+ const struct fhandle *u_fhp, \
+ struct ostatfs *buf); }
+298 AUE_FHOPEN STD { int fhopen(const struct fhandle *u_fhp, \
+ int flags); }
+299 AUE_FHSTAT STD { int fhstat(const struct fhandle *u_fhp, \
+ struct stat *sb); }
+; syscall numbers for FreeBSD
+300 AUE_NULL STD { int modnext(int modid); }
+301 AUE_NULL STD { int modstat(int modid, \
+ struct module_stat *stat); }
+302 AUE_NULL STD { int modfnext(int modid); }
+303 AUE_NULL STD { int modfind(const char *name); }
+304 AUE_MODLOAD STD { int kldload(const char *file); }
+305 AUE_MODUNLOAD STD { int kldunload(int fileid); }
+306 AUE_NULL STD { int kldfind(const char *file); }
+307 AUE_NULL STD { int kldnext(int fileid); }
+308 AUE_NULL STD { int kldstat(int fileid, struct \
+ kld_file_stat* stat); }
+309 AUE_NULL STD { int kldfirstmod(int fileid); }
+310 AUE_GETSID STD { int getsid(pid_t pid); }
+311 AUE_SETRESUID STD { int setresuid(uid_t ruid, uid_t euid, \
+ uid_t suid); }
+312 AUE_SETRESGID STD { int setresgid(gid_t rgid, gid_t egid, \
+ gid_t sgid); }
+313 AUE_NULL OBSOL signanosleep
+314 AUE_NULL NOSTD { int aio_return(struct aiocb *aiocbp); }
+315 AUE_NULL NOSTD { int aio_suspend( \
+ struct aiocb * const * aiocbp, int nent, \
+ const struct timespec *timeout); }
+316 AUE_NULL NOSTD { int aio_cancel(int fd, \
+ struct aiocb *aiocbp); }
+317 AUE_NULL NOSTD { int aio_error(struct aiocb *aiocbp); }
+318 AUE_NULL NOSTD { int oaio_read(struct oaiocb *aiocbp); }
+319 AUE_NULL NOSTD { int oaio_write(struct oaiocb *aiocbp); }
+320 AUE_NULL NOSTD { int olio_listio(int mode, \
+ struct oaiocb * const *acb_list, \
+ int nent, struct osigevent *sig); }
+321 AUE_NULL STD { int yield(void); }
+322 AUE_NULL OBSOL thr_sleep
+323 AUE_NULL OBSOL thr_wakeup
+324 AUE_MLOCKALL STD { int mlockall(int how); }
+325 AUE_MUNLOCKALL STD { int munlockall(void); }
+326 AUE_GETCWD STD { int __getcwd(u_char *buf, u_int buflen); }
+
+327 AUE_NULL STD { int sched_setparam (pid_t pid, \
+ const struct sched_param *param); }
+328 AUE_NULL STD { int sched_getparam (pid_t pid, struct \
+ sched_param *param); }
+
+329 AUE_NULL STD { int sched_setscheduler (pid_t pid, int \
+ policy, const struct sched_param \
+ *param); }
+330 AUE_NULL STD { int sched_getscheduler (pid_t pid); }
+
+331 AUE_NULL STD { int sched_yield (void); }
+332 AUE_NULL STD { int sched_get_priority_max (int policy); }
+333 AUE_NULL STD { int sched_get_priority_min (int policy); }
+334 AUE_NULL STD { int sched_rr_get_interval (pid_t pid, \
+ struct timespec *interval); }
+335 AUE_NULL STD { int utrace(const void *addr, size_t len); }
+336 AUE_SENDFILE COMPAT4 { int sendfile(int fd, int s, \
+ off_t offset, size_t nbytes, \
+ struct sf_hdtr *hdtr, off_t *sbytes, \
+ int flags); }
+337 AUE_NULL STD { int kldsym(int fileid, int cmd, \
+ void *data); }
+338 AUE_JAIL STD { int jail(struct jail *jail); }
+339 AUE_NULL NOSTD|NOTSTATIC { int nnpfs_syscall(int operation, \
+ char *a_pathP, int a_opcode, \
+ void *a_paramsP, int a_followSymlinks); }
+340 AUE_SIGPROCMASK STD { int sigprocmask(int how, \
+ const sigset_t *set, sigset_t *oset); }
+341 AUE_SIGSUSPEND STD { int sigsuspend(const sigset_t *sigmask); }
+342 AUE_SIGACTION COMPAT4 { int sigaction(int sig, const \
+ struct sigaction *act, \
+ struct sigaction *oact); }
+343 AUE_SIGPENDING STD { int sigpending(sigset_t *set); }
+344 AUE_SIGRETURN COMPAT4 { int sigreturn( \
+ const struct ucontext4 *sigcntxp); }
+345 AUE_SIGWAIT STD { int sigtimedwait(const sigset_t *set, \
+ siginfo_t *info, \
+ const struct timespec *timeout); }
+346 AUE_NULL STD { int sigwaitinfo(const sigset_t *set, \
+ siginfo_t *info); }
+347 AUE_NULL STD { int __acl_get_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+348 AUE_NULL STD { int __acl_set_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+349 AUE_NULL STD { int __acl_get_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+350 AUE_NULL STD { int __acl_set_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+351 AUE_NULL STD { int __acl_delete_file(const char *path, \
+ acl_type_t type); }
+352 AUE_NULL STD { int __acl_delete_fd(int filedes, \
+ acl_type_t type); }
+353 AUE_NULL STD { int __acl_aclcheck_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+354 AUE_NULL STD { int __acl_aclcheck_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+355 AUE_EXTATTRCTL STD { int extattrctl(const char *path, int cmd, \
+ const char *filename, int attrnamespace, \
+ const char *attrname); }
+356 AUE_EXTATTR_SET_FILE STD { ssize_t extattr_set_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+357 AUE_EXTATTR_GET_FILE STD { ssize_t extattr_get_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file(const char *path, \
+ int attrnamespace, \
+ const char *attrname); }
+359 AUE_NULL NOSTD { int aio_waitcomplete( \
+ struct aiocb **aiocbp, \
+ struct timespec *timeout); }
+360 AUE_GETRESUID STD { int getresuid(uid_t *ruid, uid_t *euid, \
+ uid_t *suid); }
+361 AUE_GETRESGID STD { int getresgid(gid_t *rgid, gid_t *egid, \
+ gid_t *sgid); }
+362 AUE_KQUEUE STD { int kqueue(void); }
+363 AUE_NULL STD { int kevent(int fd, \
+ struct kevent *changelist, int nchanges, \
+ struct kevent *eventlist, int nevents, \
+ const struct timespec *timeout); }
+364 AUE_NULL UNIMPL __cap_get_proc
+365 AUE_NULL UNIMPL __cap_set_proc
+366 AUE_NULL UNIMPL __cap_get_fd
+367 AUE_NULL UNIMPL __cap_get_file
+368 AUE_NULL UNIMPL __cap_set_fd
+369 AUE_NULL UNIMPL __cap_set_file
+370 AUE_NULL UNIMPL nosys
+371 AUE_EXTATTR_SET_FD STD { ssize_t extattr_set_fd(int fd, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+372 AUE_EXTATTR_GET_FD STD { ssize_t extattr_get_fd(int fd, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+373 AUE_EXTATTR_DELETE_FD STD { int extattr_delete_fd(int fd, \
+ int attrnamespace, \
+ const char *attrname); }
+374 AUE_NULL STD { int __setugid(int flag); }
+375 AUE_NULL UNIMPL nfsclnt
+376 AUE_EACCESS STD { int eaccess(char *path, int amode); }
+377 AUE_NULL NOSTD|NOTSTATIC { int afs3_syscall(long syscall, \
+ long parm1, long parm2, long parm3, \
+ long parm4, long parm5, long parm6); }
+378 AUE_NMOUNT STD { int nmount(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+379 AUE_NULL UNIMPL kse_exit
+380 AUE_NULL UNIMPL kse_wakeup
+381 AUE_NULL UNIMPL kse_create
+382 AUE_NULL UNIMPL kse_thr_interrupt
+383 AUE_NULL UNIMPL kse_release
+384 AUE_NULL STD { int __mac_get_proc(struct mac *mac_p); }
+385 AUE_NULL STD { int __mac_set_proc(struct mac *mac_p); }
+386 AUE_NULL STD { int __mac_get_fd(int fd, \
+ struct mac *mac_p); }
+387 AUE_NULL STD { int __mac_get_file(const char *path_p, \
+ struct mac *mac_p); }
+388 AUE_NULL STD { int __mac_set_fd(int fd, \
+ struct mac *mac_p); }
+389 AUE_NULL STD { int __mac_set_file(const char *path_p, \
+ struct mac *mac_p); }
+390 AUE_NULL STD { int kenv(int what, const char *name, \
+ char *value, int len); }
+391 AUE_LCHFLAGS STD { int lchflags(const char *path, \
+ u_long flags); }
+392 AUE_NULL STD { int uuidgen(struct uuid *store, \
+ int count); }
+393 AUE_SENDFILE STD { int sendfile(int fd, int s, off_t offset, \
+ size_t nbytes, struct sf_hdtr *hdtr, \
+ off_t *sbytes, int flags); }
+394 AUE_NULL STD { int mac_syscall(const char *policy, \
+ int call, void *arg); }
+395 AUE_GETFSSTAT STD { int getfsstat(struct statfs *buf, \
+ long bufsize, int flags); }
+396 AUE_STATFS STD { int statfs(char *path, \
+ struct statfs *buf); }
+397 AUE_FSTATFS STD { int fstatfs(int fd, struct statfs *buf); }
+398 AUE_FHSTATFS STD { int fhstatfs(const struct fhandle *u_fhp, \
+ struct statfs *buf); }
+399 AUE_NULL UNIMPL nosys
+400 AUE_NULL NOSTD { int ksem_close(semid_t id); }
+401 AUE_NULL NOSTD { int ksem_post(semid_t id); }
+402 AUE_NULL NOSTD { int ksem_wait(semid_t id); }
+403 AUE_NULL NOSTD { int ksem_trywait(semid_t id); }
+404 AUE_NULL NOSTD { int ksem_init(semid_t *idp, \
+ unsigned int value); }
+405 AUE_NULL NOSTD { int ksem_open(semid_t *idp, \
+ const char *name, int oflag, \
+ mode_t mode, unsigned int value); }
+406 AUE_NULL NOSTD { int ksem_unlink(const char *name); }
+407 AUE_NULL NOSTD { int ksem_getvalue(semid_t id, int *val); }
+408 AUE_NULL NOSTD { int ksem_destroy(semid_t id); }
+409 AUE_NULL STD { int __mac_get_pid(pid_t pid, \
+ struct mac *mac_p); }
+410 AUE_NULL STD { int __mac_get_link(const char *path_p, \
+ struct mac *mac_p); }
+411 AUE_NULL STD { int __mac_set_link(const char *path_p, \
+ struct mac *mac_p); }
+412 AUE_EXTATTR_SET_LINK STD { ssize_t extattr_set_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+413 AUE_EXTATTR_GET_LINK STD { ssize_t extattr_get_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+414 AUE_EXTATTR_DELETE_LINK STD { int extattr_delete_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname); }
+415 AUE_NULL STD { int __mac_execve(char *fname, char **argv, \
+ char **envv, struct mac *mac_p); }
+416 AUE_SIGACTION STD { int sigaction(int sig, \
+ const struct sigaction *act, \
+ struct sigaction *oact); }
+417 AUE_SIGRETURN STD { int sigreturn( \
+ const struct __ucontext *sigcntxp); }
+418 AUE_NULL UNIMPL __xstat
+419 AUE_NULL UNIMPL __xfstat
+420 AUE_NULL UNIMPL __xlstat
+421 AUE_NULL STD { int getcontext(struct __ucontext *ucp); }
+422 AUE_NULL STD { int setcontext( \
+ const struct __ucontext *ucp); }
+423 AUE_NULL STD { int swapcontext(struct __ucontext *oucp, \
+ const struct __ucontext *ucp); }
+424 AUE_SWAPOFF STD { int swapoff(const char *name); }
+425 AUE_NULL STD { int __acl_get_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+426 AUE_NULL STD { int __acl_set_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+427 AUE_NULL STD { int __acl_delete_link(const char *path, \
+ acl_type_t type); }
+428 AUE_NULL STD { int __acl_aclcheck_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+429 AUE_SIGWAIT STD { int sigwait(const sigset_t *set, \
+ int *sig); }
+430 AUE_NULL STD { int thr_create(ucontext_t *ctx, long *id, \
+ int flags); }
+431 AUE_NULL STD { void thr_exit(long *state); }
+432 AUE_NULL STD { int thr_self(long *id); }
+433 AUE_NULL STD { int thr_kill(long id, int sig); }
+434 AUE_NULL STD { int _umtx_lock(struct umtx *umtx); }
+435 AUE_NULL STD { int _umtx_unlock(struct umtx *umtx); }
+436 AUE_NULL STD { int jail_attach(int jid); }
+437 AUE_EXTATTR_LIST_FD STD { ssize_t extattr_list_fd(int fd, \
+ int attrnamespace, void *data, \
+ size_t nbytes); }
+438 AUE_EXTATTR_LIST_FILE STD { ssize_t extattr_list_file( \
+ const char *path, int attrnamespace, \
+ void *data, size_t nbytes); }
+439 AUE_EXTATTR_LIST_LINK STD { ssize_t extattr_list_link( \
+ const char *path, int attrnamespace, \
+ void *data, size_t nbytes); }
+440 AUE_NULL UNIMPL kse_switchin
+441 AUE_NULL NOSTD { int ksem_timedwait(semid_t id, \
+ const struct timespec *abstime); }
+442 AUE_NULL STD { int thr_suspend( \
+ const struct timespec *timeout); }
+443 AUE_NULL STD { int thr_wake(long id); }
+444 AUE_MODUNLOAD STD { int kldunloadf(int fileid, int flags); }
+445 AUE_AUDIT STD { int audit(const void *record, \
+ u_int length); }
+446 AUE_AUDITON STD { int auditon(int cmd, void *data, \
+ u_int length); }
+447 AUE_GETAUID STD { int getauid(uid_t *auid); }
+448 AUE_SETAUID STD { int setauid(uid_t *auid); }
+449 AUE_GETAUDIT STD { int getaudit(struct auditinfo *auditinfo); }
+450 AUE_SETAUDIT STD { int setaudit(struct auditinfo *auditinfo); }
+451 AUE_GETAUDIT_ADDR STD { int getaudit_addr( \
+ struct auditinfo_addr *auditinfo_addr, \
+ u_int length); }
+452 AUE_SETAUDIT_ADDR STD { int setaudit_addr( \
+ struct auditinfo_addr *auditinfo_addr, \
+ u_int length); }
+453 AUE_AUDITCTL STD { int auditctl(char *path); }
+454 AUE_NULL STD { int _umtx_op(void *obj, int op, \
+ u_long val, void *uaddr1, void *uaddr2); }
+455 AUE_NULL STD { int thr_new(struct thr_param *param, \
+ int param_size); }
+456 AUE_NULL STD { int sigqueue(pid_t pid, int signum, void *value); }
+457 AUE_NULL NOSTD { int kmq_open(const char *path, int flags, \
+ mode_t mode, const struct mq_attr *attr); }
+458 AUE_NULL NOSTD { int kmq_setattr(int mqd, \
+ const struct mq_attr *attr, \
+ struct mq_attr *oattr); }
+459 AUE_NULL NOSTD { int kmq_timedreceive(int mqd, \
+ char *msg_ptr, size_t msg_len, \
+ unsigned *msg_prio, \
+ const struct timespec *abs_timeout); }
+460 AUE_NULL NOSTD { int kmq_timedsend(int mqd, \
+ const char *msg_ptr, size_t msg_len,\
+ unsigned msg_prio, \
+ const struct timespec *abs_timeout);}
+461 AUE_NULL NOSTD { int kmq_notify(int mqd, \
+ const struct sigevent *sigev); }
+462 AUE_NULL NOSTD { int kmq_unlink(const char *path); }
+463 AUE_NULL STD { int abort2(const char *why, int nargs, void **args); }
+464 AUE_NULL STD { int thr_set_name(long id, const char *name); }
+465 AUE_NULL NOSTD { int aio_fsync(int op, struct aiocb *aiocbp); }
+466 AUE_RTPRIO STD { int rtprio_thread(int function, \
+ lwpid_t lwpid, struct rtprio *rtp); }
+467 AUE_NULL UNIMPL nosys
+468 AUE_NULL UNIMPL nosys
+469 AUE_NULL UNIMPL __getpath_fromfd
+470 AUE_NULL UNIMPL __getpath_fromaddr
+471 AUE_NULL STD { int sctp_peeloff(int sd, uint32_t name); }
+472 AUE_NULL STD { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+473 AUE_NULL STD { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+474 AUE_NULL STD { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, \
+ struct sockaddr * from, __socklen_t *fromlenaddr, \
+ struct sctp_sndrcvinfo *sinfo, int *msg_flags); }
+475 AUE_PREAD STD { ssize_t pread(int fd, void *buf, \
+ size_t nbyte, off_t offset); }
+476 AUE_PWRITE STD { ssize_t pwrite(int fd, const void *buf, \
+ size_t nbyte, off_t offset); }
+477 AUE_MMAP STD { caddr_t mmap(caddr_t addr, size_t len, \
+ int prot, int flags, int fd, off_t pos); }
+478 AUE_LSEEK STD { off_t lseek(int fd, off_t offset, \
+ int whence); }
+479 AUE_TRUNCATE STD { int truncate(char *path, off_t length); }
+480 AUE_FTRUNCATE STD { int ftruncate(int fd, off_t length); }
+481 AUE_KILL STD { int thr_kill2(pid_t pid, long id, int sig); }
+482 AUE_SHMOPEN STD { int shm_open(const char *path, int flags, \
+ mode_t mode); }
+483 AUE_SHMUNLINK STD { int shm_unlink(const char *path); }
+484 AUE_NULL STD { int cpuset(cpusetid_t *setid); }
+485 AUE_NULL STD { int cpuset_setid(cpuwhich_t which, id_t id, \
+ cpusetid_t setid); }
+486 AUE_NULL STD { int cpuset_getid(cpulevel_t level, \
+ cpuwhich_t which, id_t id, \
+ cpusetid_t *setid); }
+487 AUE_NULL STD { int cpuset_getaffinity(cpulevel_t level, \
+ cpuwhich_t which, id_t id, size_t cpusetsize, \
+ cpuset_t *mask); }
+488 AUE_NULL STD { int cpuset_setaffinity(cpulevel_t level, \
+ cpuwhich_t which, id_t id, size_t cpusetsize, \
+ const cpuset_t *mask); }
+489 AUE_FACCESSAT STD { int faccessat(int fd, char *path, int amode, \
+ int flag); }
+490 AUE_FCHMODAT STD { int fchmodat(int fd, char *path, mode_t mode, \
+ int flag); }
+491 AUE_FCHOWNAT STD { int fchownat(int fd, char *path, uid_t uid, \
+ gid_t gid, int flag); }
+492 AUE_FEXECVE STD { int fexecve(int fd, char **argv, \
+ char **envv); }
+493 AUE_FSTATAT STD { int fstatat(int fd, char *path, \
+ struct stat *buf, int flag); }
+494 AUE_FUTIMESAT STD { int futimesat(int fd, char *path, \
+ struct timeval *times); }
+495 AUE_LINKAT STD { int linkat(int fd1, char *path1, int fd2, \
+ char *path2, int flag); }
+496 AUE_MKDIRAT STD { int mkdirat(int fd, char *path, mode_t mode); }
+497 AUE_MKFIFOAT STD { int mkfifoat(int fd, char *path, mode_t mode); }
+498 AUE_MKNODAT STD { int mknodat(int fd, char *path, mode_t mode, \
+ dev_t dev); }
+; XXX: see the comment for open
+499 AUE_OPENAT_RWTC STD { int openat(int fd, char *path, int flag, \
+ mode_t mode); }
+500 AUE_READLINKAT STD { int readlinkat(int fd, char *path, char *buf, \
+ size_t bufsize); }
+501 AUE_RENAMEAT STD { int renameat(int oldfd, char *old, int newfd, \
+ char *new); }
+502 AUE_SYMLINKAT STD { int symlinkat(char *path1, int fd, \
+ char *path2); }
+503 AUE_UNLINKAT STD { int unlinkat(int fd, char *path, int flag); }
+504 AUE_POSIX_OPENPT STD { int posix_openpt(int flags); }
+; 505 is initialised by the kgssapi code, if present.
+505 AUE_NULL NOSTD { int gssd_syscall(char *path); }
+506 AUE_NULL STD { int jail_get(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+507 AUE_NULL STD { int jail_set(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+508 AUE_NULL STD { int jail_remove(int jid); }
+509 AUE_CLOSEFROM STD { int closefrom(int lowfd); }
+510 AUE_SEMCTL NOSTD { int __semctl(int semid, int semnum, \
+ int cmd, union semun *arg); }
+511 AUE_MSGCTL NOSTD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds *buf); }
+512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds *buf); }
+513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); }
+514 AUE_NULL OBSOL cap_new
+515 AUE_CAP_RIGHTS_GET STD { int __cap_rights_get(int version, \
+ int fd, cap_rights_t *rightsp); }
+516 AUE_CAP_ENTER STD { int cap_enter(void); }
+517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); }
+518 AUE_PDFORK STD { int pdfork(int *fdp, int flags); }
+519 AUE_PDKILL STD { int pdkill(int fd, int signum); }
+520 AUE_PDGETPID STD { int pdgetpid(int fd, pid_t *pidp); }
+521 AUE_PDWAIT UNIMPL pdwait4
+522 AUE_SELECT STD { int pselect(int nd, fd_set *in, \
+ fd_set *ou, fd_set *ex, \
+ const struct timespec *ts, \
+ const sigset_t *sm); }
+523 AUE_NULL STD { int getloginclass(char *namebuf, \
+ size_t namelen); }
+524 AUE_NULL STD { int setloginclass(const char *namebuf); }
+525 AUE_NULL STD { int rctl_get_racct(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+526 AUE_NULL STD { int rctl_get_rules(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+527 AUE_NULL STD { int rctl_get_limits(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+528 AUE_NULL STD { int rctl_add_rule(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+529 AUE_NULL STD { int rctl_remove_rule(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+530 AUE_NULL STD { int posix_fallocate(int fd, \
+ off_t offset, off_t len); }
+531 AUE_NULL STD { int posix_fadvise(int fd, off_t offset, \
+ off_t len, int advice); }
+532 AUE_WAIT6 STD { int wait6(int idtype, id_t id, \
+ int *status, int options, \
+ struct __wrusage *wrusage, \
+ siginfo_t *info); }
+533 AUE_CAP_RIGHTS_LIMIT STD { int cap_rights_limit(int fd, \
+ cap_rights_t *rightsp); }
+534 AUE_CAP_IOCTLS_LIMIT STD { int cap_ioctls_limit(int fd, \
+ const u_long *cmds, size_t ncmds); }
+535 AUE_CAP_IOCTLS_GET STD { ssize_t cap_ioctls_get(int fd, \
+ u_long *cmds, size_t maxcmds); }
+536 AUE_CAP_FCNTLS_LIMIT STD { int cap_fcntls_limit(int fd, \
+ uint32_t fcntlrights); }
+537 AUE_CAP_FCNTLS_GET STD { int cap_fcntls_get(int fd, \
+ uint32_t *fcntlrightsp); }
+538 AUE_BINDAT STD { int bindat(int fd, int s, caddr_t name, \
+ int namelen); }
+539 AUE_CONNECTAT STD { int connectat(int fd, int s, caddr_t name, \
+ int namelen); }
+540 AUE_CHFLAGSAT STD { int chflagsat(int fd, const char *path, \
+ u_long flags, int atflag); }
+541 AUE_ACCEPT STD { int accept4(int s, \
+ struct sockaddr * __restrict name, \
+ __socklen_t * __restrict anamelen, \
+ int flags); }
+542 AUE_PIPE STD { int pipe2(int *fildes, int flags); }
+543 AUE_NULL NOSTD { int aio_mlock(struct aiocb *aiocbp); }
+; Please copy any additions and changes to the following compatibility tables:
+; sys/compat/freebsd32/syscalls.master
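
The table above is consumed by makesyscalls.sh, which emits the systrace_args.c converter in the next file: for each syscall it copies the handler's arguments into a flat uint64_t array, writing integer arguments through a signed int64_t view (iarg) and pointer arguments into uarg via (intptr_t) casts. The short user-space sketch below only illustrates that packing convention; the struct name read_args_sketch, the helper pack_read_args(), and the sample values are hypothetical and are not part of the kernel sources.

    /*
     * Minimal sketch (not kernel code) of the argument-packing convention
     * used by the generated systrace_args(): signed ints go into the
     * int64_t view of the buffer, pointers go into uarg[] as intptr_t.
     * The structure below is a hypothetical stand-in for read_args.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct read_args_sketch {
            int      fd;
            void    *buf;
            size_t   nbyte;
    };

    static void
    pack_read_args(void *params, uint64_t *uarg, int *n_args)
    {
            int64_t *iarg = (int64_t *)uarg;        /* same storage, signed view */
            struct read_args_sketch *p = params;

            iarg[0] = p->fd;                        /* int */
            uarg[1] = (intptr_t)p->buf;             /* void * */
            uarg[2] = p->nbyte;                     /* size_t */
            *n_args = 3;
    }

    int
    main(void)
    {
            char buf[16];
            struct read_args_sketch a = { 0, buf, sizeof(buf) };
            uint64_t uarg[8];
            int n;

            pack_read_args(&a, uarg, &n);
            printf("n_args=%d fd=%lld buf=%#llx nbyte=%llu\n", n,
                (long long)((int64_t *)uarg)[0],
                (unsigned long long)uarg[1],
                (unsigned long long)uarg[2]);
            return (0);
    }
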
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
new file mode 100644
index 0000000..0a6bae4
--- /dev/null
+++ b/sys/kern/systrace_args.c
@@ -0,0 +1,10946 @@
+/*
+ * System call argument to DTrace register array conversion.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * This file is part of the DTrace syscall provider.
+ */
+
+static void
+systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
+{
+ int64_t *iarg = (int64_t *) uarg;
+ switch (sysnum) {
+ /* nosys */
+ case 0: {
+ *n_args = 0;
+ break;
+ }
+ /* sys_exit */
+ case 1: {
+ struct sys_exit_args *p = params;
+ iarg[0] = p->rval; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* fork */
+ case 2: {
+ *n_args = 0;
+ break;
+ }
+ /* read */
+ case 3: {
+ struct read_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* write */
+ case 4: {
+ struct write_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* open */
+ case 5: {
+ struct open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* close */
+ case 6: {
+ struct close_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* wait4 */
+ case 7: {
+ struct wait4_args *p = params;
+ iarg[0] = p->pid; /* int */
+ uarg[1] = (intptr_t) p->status; /* int * */
+ iarg[2] = p->options; /* int */
+ uarg[3] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 4;
+ break;
+ }
+ /* link */
+ case 9: {
+ struct link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* unlink */
+ case 10: {
+ struct unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* chdir */
+ case 12: {
+ struct chdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* fchdir */
+ case 13: {
+ struct fchdir_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* mknod */
+ case 14: {
+ struct mknod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ iarg[2] = p->dev; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* chmod */
+ case 15: {
+ struct chmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chown */
+ case 16: {
+ struct chown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* obreak */
+ case 17: {
+ struct obreak_args *p = params;
+ uarg[0] = (intptr_t) p->nsize; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getpid */
+ case 20: {
+ *n_args = 0;
+ break;
+ }
+ /* mount */
+ case 21: {
+ struct mount_args *p = params;
+ uarg[0] = (intptr_t) p->type; /* char * */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flags; /* int */
+ uarg[3] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* unmount */
+ case 22: {
+ struct unmount_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setuid */
+ case 23: {
+ struct setuid_args *p = params;
+ uarg[0] = p->uid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* getuid */
+ case 24: {
+ *n_args = 0;
+ break;
+ }
+ /* geteuid */
+ case 25: {
+ *n_args = 0;
+ break;
+ }
+ /* ptrace */
+ case 26: {
+ struct ptrace_args *p = params;
+ iarg[0] = p->req; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->addr; /* caddr_t */
+ iarg[3] = p->data; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* recvmsg */
+ case 27: {
+ struct recvmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* sendmsg */
+ case 28: {
+ struct sendmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* recvfrom */
+ case 29: {
+ struct recvfrom_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->from; /* struct sockaddr *__restrict */
+ uarg[5] = (intptr_t) p->fromlenaddr; /* __socklen_t *__restrict */
+ *n_args = 6;
+ break;
+ }
+ /* accept */
+ case 30: {
+ struct accept_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getpeername */
+ case 31: {
+ struct getpeername_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getsockname */
+ case 32: {
+ struct getsockname_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* access */
+ case 33: {
+ struct access_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->amode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chflags */
+ case 34: {
+ struct chflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* fchflags */
+ case 35: {
+ struct fchflags_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* sync */
+ case 36: {
+ *n_args = 0;
+ break;
+ }
+ /* kill */
+ case 37: {
+ struct kill_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->signum; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getppid */
+ case 39: {
+ *n_args = 0;
+ break;
+ }
+ /* dup */
+ case 41: {
+ struct dup_args *p = params;
+ uarg[0] = p->fd; /* u_int */
+ *n_args = 1;
+ break;
+ }
+ /* pipe */
+ case 42: {
+ *n_args = 0;
+ break;
+ }
+ /* getegid */
+ case 43: {
+ *n_args = 0;
+ break;
+ }
+ /* profil */
+ case 44: {
+ struct profil_args *p = params;
+ uarg[0] = (intptr_t) p->samples; /* caddr_t */
+ uarg[1] = p->size; /* size_t */
+ uarg[2] = p->offset; /* size_t */
+ uarg[3] = p->scale; /* u_int */
+ *n_args = 4;
+ break;
+ }
+ /* ktrace */
+ case 45: {
+ struct ktrace_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* const char * */
+ iarg[1] = p->ops; /* int */
+ iarg[2] = p->facs; /* int */
+ iarg[3] = p->pid; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* getgid */
+ case 47: {
+ *n_args = 0;
+ break;
+ }
+ /* getlogin */
+ case 49: {
+ struct getlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ uarg[1] = p->namelen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setlogin */
+ case 50: {
+ struct setlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* acct */
+ case 51: {
+ struct acct_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* sigaltstack */
+ case 53: {
+ struct sigaltstack_args *p = params;
+ uarg[0] = (intptr_t) p->ss; /* stack_t * */
+ uarg[1] = (intptr_t) p->oss; /* stack_t * */
+ *n_args = 2;
+ break;
+ }
+ /* ioctl */
+ case 54: {
+ struct ioctl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->com; /* u_long */
+ uarg[2] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 3;
+ break;
+ }
+ /* reboot */
+ case 55: {
+ struct reboot_args *p = params;
+ iarg[0] = p->opt; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* revoke */
+ case 56: {
+ struct revoke_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* symlink */
+ case 57: {
+ struct symlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* readlink */
+ case 58: {
+ struct readlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* execve */
+ case 59: {
+ struct execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ *n_args = 3;
+ break;
+ }
+ /* umask */
+ case 60: {
+ struct umask_args *p = params;
+ iarg[0] = p->newmask; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* chroot */
+ case 61: {
+ struct chroot_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* msync */
+ case 65: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* vfork */
+ case 66: {
+ *n_args = 0;
+ break;
+ }
+ /* sbrk */
+ case 69: {
+ struct sbrk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sstk */
+ case 70: {
+ struct sstk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ovadvise */
+ case 72: {
+ struct ovadvise_args *p = params;
+ iarg[0] = p->anom; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munmap */
+ case 73: {
+ struct munmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* mprotect */
+ case 74: {
+ struct mprotect_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* madvise */
+ case 75: {
+ struct madvise_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->behav; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* mincore */
+ case 78: {
+ struct mincore_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ uarg[2] = (intptr_t) p->vec; /* char * */
+ *n_args = 3;
+ break;
+ }
+ /* getgroups */
+ case 79: {
+ struct getgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* setgroups */
+ case 80: {
+ struct setgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgrp */
+ case 81: {
+ *n_args = 0;
+ break;
+ }
+ /* setpgid */
+ case 82: {
+ struct setpgid_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->pgid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setitimer */
+ case 83: {
+ struct setitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ uarg[2] = (intptr_t) p->oitv; /* struct itimerval * */
+ *n_args = 3;
+ break;
+ }
+ /* swapon */
+ case 85: {
+ struct swapon_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getitimer */
+ case 86: {
+ struct getitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ *n_args = 2;
+ break;
+ }
+ /* getdtablesize */
+ case 89: {
+ *n_args = 0;
+ break;
+ }
+ /* dup2 */
+ case 90: {
+ struct dup2_args *p = params;
+ uarg[0] = p->from; /* u_int */
+ uarg[1] = p->to; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* fcntl */
+ case 92: {
+ struct fcntl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->arg; /* long */
+ *n_args = 3;
+ break;
+ }
+ /* select */
+ case 93: {
+ struct select_args *p = params;
+ iarg[0] = p->nd; /* int */
+ uarg[1] = (intptr_t) p->in; /* fd_set * */
+ uarg[2] = (intptr_t) p->ou; /* fd_set * */
+ uarg[3] = (intptr_t) p->ex; /* fd_set * */
+ uarg[4] = (intptr_t) p->tv; /* struct timeval * */
+ *n_args = 5;
+ break;
+ }
+ /* fsync */
+ case 95: {
+ struct fsync_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* setpriority */
+ case 96: {
+ struct setpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ iarg[2] = p->prio; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* socket */
+ case 97: {
+ struct socket_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* connect */
+ case 98: {
+ struct connect_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* getpriority */
+ case 100: {
+ struct getpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* bind */
+ case 104: {
+ struct bind_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* setsockopt */
+ case 105: {
+ struct setsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ iarg[4] = p->valsize; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* listen */
+ case 106: {
+ struct listen_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->backlog; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* gettimeofday */
+ case 116: {
+ struct gettimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* getrusage */
+ case 117: {
+ struct getrusage_args *p = params;
+ iarg[0] = p->who; /* int */
+ uarg[1] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 2;
+ break;
+ }
+ /* getsockopt */
+ case 118: {
+ struct getsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ uarg[4] = (intptr_t) p->avalsize; /* int * */
+ *n_args = 5;
+ break;
+ }
+ /* readv */
+ case 120: {
+ struct readv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* writev */
+ case 121: {
+ struct writev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* settimeofday */
+ case 122: {
+ struct settimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tv; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* fchown */
+ case 123: {
+ struct fchown_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* fchmod */
+ case 124: {
+ struct fchmod_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setreuid */
+ case 126: {
+ struct setreuid_args *p = params;
+ iarg[0] = p->ruid; /* int */
+ iarg[1] = p->euid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setregid */
+ case 127: {
+ struct setregid_args *p = params;
+ iarg[0] = p->rgid; /* int */
+ iarg[1] = p->egid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rename */
+ case 128: {
+ struct rename_args *p = params;
+ uarg[0] = (intptr_t) p->from; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* flock */
+ case 131: {
+ struct flock_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* mkfifo */
+ case 132: {
+ struct mkfifo_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendto */
+ case 133: {
+ struct sendto_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->to; /* caddr_t */
+ iarg[5] = p->tolen; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shutdown */
+ case 134: {
+ struct shutdown_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* socketpair */
+ case 135: {
+ struct socketpair_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ uarg[3] = (intptr_t) p->rsv; /* int * */
+ *n_args = 4;
+ break;
+ }
+ /* mkdir */
+ case 136: {
+ struct mkdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rmdir */
+ case 137: {
+ struct rmdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* utimes */
+ case 138: {
+ struct utimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* adjtime */
+ case 140: {
+ struct adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->delta; /* struct timeval * */
+ uarg[1] = (intptr_t) p->olddelta; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* setsid */
+ case 147: {
+ *n_args = 0;
+ break;
+ }
+ /* quotactl */
+ case 148: {
+ struct quotactl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->uid; /* int */
+ uarg[3] = (intptr_t) p->arg; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* nlm_syscall */
+ case 154: {
+ struct nlm_syscall_args *p = params;
+ iarg[0] = p->debug_level; /* int */
+ iarg[1] = p->grace_period; /* int */
+ iarg[2] = p->addr_count; /* int */
+ uarg[3] = (intptr_t) p->addrs; /* char ** */
+ *n_args = 4;
+ break;
+ }
+ /* nfssvc */
+ case 155: {
+ struct nfssvc_args *p = params;
+ iarg[0] = p->flag; /* int */
+ uarg[1] = (intptr_t) p->argp; /* caddr_t */
+ *n_args = 2;
+ break;
+ }
+ /* lgetfh */
+ case 160: {
+ struct lgetfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* getfh */
+ case 161: {
+ struct getfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* sysarch */
+ case 165: {
+ struct sysarch_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->parms; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio */
+ case 166: {
+ struct rtprio_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* semsys */
+ case 169: {
+ struct semsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* msgsys */
+ case 170: {
+ struct msgsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ iarg[5] = p->a6; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shmsys */
+ case 171: {
+ struct shmsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_pread */
+ case 173: {
+ struct freebsd6_pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* freebsd6_pwrite */
+ case 174: {
+ struct freebsd6_pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* setfib */
+ case 175: {
+ struct setfib_args *p = params;
+ iarg[0] = p->fibnum; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ntp_adjtime */
+ case 176: {
+ struct ntp_adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timex * */
+ *n_args = 1;
+ break;
+ }
+ /* setgid */
+ case 181: {
+ struct setgid_args *p = params;
+ iarg[0] = p->gid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setegid */
+ case 182: {
+ struct setegid_args *p = params;
+ iarg[0] = p->egid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* seteuid */
+ case 183: {
+ struct seteuid_args *p = params;
+ uarg[0] = p->euid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* stat */
+ case 188: {
+ struct stat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* fstat */
+ case 189: {
+ struct fstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* lstat */
+ case 190: {
+ struct lstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* pathconf */
+ case 191: {
+ struct pathconf_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fpathconf */
+ case 192: {
+ struct fpathconf_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getrlimit */
+ case 194: {
+ struct __getrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* setrlimit */
+ case 195: {
+ struct __setrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* getdirentries */
+ case 196: {
+ struct getdirentries_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* u_int */
+ uarg[3] = (intptr_t) p->basep; /* long * */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_mmap */
+ case 197: {
+ struct freebsd6_mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pad; /* int */
+ iarg[6] = p->pos; /* off_t */
+ *n_args = 7;
+ break;
+ }
+ /* nosys */
+ case 198: {
+ *n_args = 0;
+ break;
+ }
+ /* freebsd6_lseek */
+ case 199: {
+ struct freebsd6_lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->offset; /* off_t */
+ iarg[3] = p->whence; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_truncate */
+ case 200: {
+ struct freebsd6_truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* freebsd6_ftruncate */
+ case 201: {
+ struct freebsd6_ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* __sysctl */
+ case 202: {
+ struct sysctl_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* int * */
+ uarg[1] = p->namelen; /* u_int */
+ uarg[2] = (intptr_t) p->old; /* void * */
+ uarg[3] = (intptr_t) p->oldlenp; /* size_t * */
+ uarg[4] = (intptr_t) p->new; /* void * */
+ uarg[5] = p->newlen; /* size_t */
+ *n_args = 6;
+ break;
+ }
+ /* mlock */
+ case 203: {
+ struct mlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* munlock */
+ case 204: {
+ struct munlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* undelete */
+ case 205: {
+ struct undelete_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* futimes */
+ case 206: {
+ struct futimes_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgid */
+ case 207: {
+ struct getpgid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* poll */
+ case 209: {
+ struct poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* lkmnosys */
+ case 210: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 211: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 212: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 213: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 214: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 215: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 216: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 217: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 218: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 219: {
+ *n_args = 0;
+ break;
+ }
+ /* semget */
+ case 221: {
+ struct semget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->nsems; /* int */
+ iarg[2] = p->semflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* semop */
+ case 222: {
+ struct semop_args *p = params;
+ iarg[0] = p->semid; /* int */
+ uarg[1] = (intptr_t) p->sops; /* struct sembuf * */
+ uarg[2] = p->nsops; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* msgget */
+ case 225: {
+ struct msgget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->msgflg; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* msgsnd */
+ case 226: {
+ struct msgsnd_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* const void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgflg; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* msgrcv */
+ case 227: {
+ struct msgrcv_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgtyp; /* long */
+ iarg[4] = p->msgflg; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* shmat */
+ case 228: {
+ struct shmat_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ uarg[1] = (intptr_t) p->shmaddr; /* const void * */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* shmdt */
+ case 230: {
+ struct shmdt_args *p = params;
+ uarg[0] = (intptr_t) p->shmaddr; /* const void * */
+ *n_args = 1;
+ break;
+ }
+ /* shmget */
+ case 231: {
+ struct shmget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ uarg[1] = p->size; /* size_t */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* clock_gettime */
+ case 232: {
+ struct clock_gettime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_settime */
+ case 233: {
+ struct clock_settime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_getres */
+ case 234: {
+ struct clock_getres_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_create */
+ case 235: {
+ struct ktimer_create_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->evp; /* struct sigevent * */
+ uarg[2] = (intptr_t) p->timerid; /* int * */
+ *n_args = 3;
+ break;
+ }
+ /* ktimer_delete */
+ case 236: {
+ struct ktimer_delete_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ktimer_settime */
+ case 237: {
+ struct ktimer_settime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ iarg[1] = p->flags; /* int */
+ uarg[2] = (intptr_t) p->value; /* const struct itimerspec * */
+ uarg[3] = (intptr_t) p->ovalue; /* struct itimerspec * */
+ *n_args = 4;
+ break;
+ }
+ /* ktimer_gettime */
+ case 238: {
+ struct ktimer_gettime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ uarg[1] = (intptr_t) p->value; /* struct itimerspec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_getoverrun */
+ case 239: {
+ struct ktimer_getoverrun_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* nanosleep */
+ case 240: {
+ struct nanosleep_args *p = params;
+ uarg[0] = (intptr_t) p->rqtp; /* const struct timespec * */
+ uarg[1] = (intptr_t) p->rmtp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ffclock_getcounter */
+ case 241: {
+ struct ffclock_getcounter_args *p = params;
+ uarg[0] = (intptr_t) p->ffcount; /* ffcounter * */
+ *n_args = 1;
+ break;
+ }
+ /* ffclock_setestimate */
+ case 242: {
+ struct ffclock_setestimate_args *p = params;
+ uarg[0] = (intptr_t) p->cest; /* struct ffclock_estimate * */
+ *n_args = 1;
+ break;
+ }
+ /* ffclock_getestimate */
+ case 243: {
+ struct ffclock_getestimate_args *p = params;
+ uarg[0] = (intptr_t) p->cest; /* struct ffclock_estimate * */
+ *n_args = 1;
+ break;
+ }
+ /* clock_getcpuclockid2 */
+ case 247: {
+ struct clock_getcpuclockid2_args *p = params;
+ iarg[0] = p->id; /* id_t */
+ iarg[1] = p->which; /* int */
+ uarg[2] = (intptr_t) p->clock_id; /* clockid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* ntp_gettime */
+ case 248: {
+ struct ntp_gettime_args *p = params;
+ uarg[0] = (intptr_t) p->ntvp; /* struct ntptimeval * */
+ *n_args = 1;
+ break;
+ }
+ /* minherit */
+ case 250: {
+ struct minherit_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->inherit; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* rfork */
+ case 251: {
+ struct rfork_args *p = params;
+ iarg[0] = p->flags; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* openbsd_poll */
+ case 252: {
+ struct openbsd_poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* issetugid */
+ case 253: {
+ *n_args = 0;
+ break;
+ }
+ /* lchown */
+ case 254: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* aio_read */
+ case 255: {
+ struct aio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_write */
+ case 256: {
+ struct aio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* lio_listio */
+ case 257: {
+ struct lio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct aiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct sigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* getdents */
+ case 272: {
+ struct getdents_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* lchmod */
+ case 274: {
+ struct lchmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* mode_t */
+ *n_args = 2;
+ break;
+ }
+ /* lchown */
+ case 275: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = p->uid; /* uid_t */
+ iarg[2] = p->gid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* lutimes */
+ case 276: {
+ struct lutimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* msync */
+ case 277: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* nstat */
+ case 278: {
+ struct nstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nfstat */
+ case 279: {
+ struct nfstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nlstat */
+ case 280: {
+ struct nlstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* preadv */
+ case 289: {
+ struct preadv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwritev */
+ case 290: {
+ struct pwritev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* fhopen */
+ case 298: {
+ struct fhopen_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fhstat */
+ case 299: {
+ struct fhstat_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modnext */
+ case 300: {
+ struct modnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modstat */
+ case 301: {
+ struct modstat_args *p = params;
+ iarg[0] = p->modid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct module_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modfnext */
+ case 302: {
+ struct modfnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modfind */
+ case 303: {
+ struct modfind_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldload */
+ case 304: {
+ struct kldload_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldunload */
+ case 305: {
+ struct kldunload_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldfind */
+ case 306: {
+ struct kldfind_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldnext */
+ case 307: {
+ struct kldnext_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldstat */
+ case 308: {
+ struct kldstat_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct kld_file_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* kldfirstmod */
+ case 309: {
+ struct kldfirstmod_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* getsid */
+ case 310: {
+ struct getsid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setresuid */
+ case 311: {
+ struct setresuid_args *p = params;
+ uarg[0] = p->ruid; /* uid_t */
+ uarg[1] = p->euid; /* uid_t */
+ uarg[2] = p->suid; /* uid_t */
+ *n_args = 3;
+ break;
+ }
+ /* setresgid */
+ case 312: {
+ struct setresgid_args *p = params;
+ iarg[0] = p->rgid; /* gid_t */
+ iarg[1] = p->egid; /* gid_t */
+ iarg[2] = p->sgid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* aio_return */
+ case 314: {
+ struct aio_return_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_suspend */
+ case 315: {
+ struct aio_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb *const * */
+ iarg[1] = p->nent; /* int */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_cancel */
+ case 316: {
+ struct aio_cancel_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_error */
+ case 317: {
+ struct aio_error_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_read */
+ case 318: {
+ struct oaio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_write */
+ case 319: {
+ struct oaio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* olio_listio */
+ case 320: {
+ struct olio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct oaiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct osigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* yield */
+ case 321: {
+ *n_args = 0;
+ break;
+ }
+ /* mlockall */
+ case 324: {
+ struct mlockall_args *p = params;
+ iarg[0] = p->how; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munlockall */
+ case 325: {
+ *n_args = 0;
+ break;
+ }
+ /* __getcwd */
+ case 326: {
+ struct __getcwd_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* u_char * */
+ uarg[1] = p->buflen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setparam */
+ case 327: {
+ struct sched_setparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_getparam */
+ case 328: {
+ struct sched_getparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setscheduler */
+ case 329: {
+ struct sched_setscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->policy; /* int */
+ uarg[2] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 3;
+ break;
+ }
+ /* sched_getscheduler */
+ case 330: {
+ struct sched_getscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* sched_yield */
+ case 331: {
+ *n_args = 0;
+ break;
+ }
+ /* sched_get_priority_max */
+ case 332: {
+ struct sched_get_priority_max_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_get_priority_min */
+ case 333: {
+ struct sched_get_priority_min_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_rr_get_interval */
+ case 334: {
+ struct sched_rr_get_interval_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->interval; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* utrace */
+ case 335: {
+ struct utrace_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* kldsym */
+ case 337: {
+ struct kldsym_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* jail */
+ case 338: {
+ struct jail_args *p = params;
+ uarg[0] = (intptr_t) p->jail; /* struct jail * */
+ *n_args = 1;
+ break;
+ }
+ /* nnpfs_syscall */
+ case 339: {
+ struct nnpfs_syscall_args *p = params;
+ iarg[0] = p->operation; /* int */
+ uarg[1] = (intptr_t) p->a_pathP; /* char * */
+ iarg[2] = p->a_opcode; /* int */
+ uarg[3] = (intptr_t) p->a_paramsP; /* void * */
+ iarg[4] = p->a_followSymlinks; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* sigprocmask */
+ case 340: {
+ struct sigprocmask_args *p = params;
+ iarg[0] = p->how; /* int */
+ uarg[1] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[2] = (intptr_t) p->oset; /* sigset_t * */
+ *n_args = 3;
+ break;
+ }
+ /* sigsuspend */
+ case 341: {
+ struct sigsuspend_args *p = params;
+ uarg[0] = (intptr_t) p->sigmask; /* const sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigpending */
+ case 343: {
+ struct sigpending_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigtimedwait */
+ case 345: {
+ struct sigtimedwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwaitinfo */
+ case 346: {
+ struct sigwaitinfo_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_get_file */
+ case 347: {
+ struct __acl_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_file */
+ case 348: {
+ struct __acl_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_get_fd */
+ case 349: {
+ struct __acl_get_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_fd */
+ case 350: {
+ struct __acl_set_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_file */
+ case 351: {
+ struct __acl_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_delete_fd */
+ case 352: {
+ struct __acl_delete_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_file */
+ case 353: {
+ struct __acl_aclcheck_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_aclcheck_fd */
+ case 354: {
+ struct __acl_aclcheck_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* extattrctl */
+ case 355: {
+ struct extattrctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->filename; /* const char * */
+ iarg[3] = p->attrnamespace; /* int */
+ uarg[4] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_set_file */
+ case 356: {
+ struct extattr_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_file */
+ case 357: {
+ struct extattr_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_file */
+ case 358: {
+ struct extattr_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_waitcomplete */
+ case 359: {
+ struct aio_waitcomplete_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb ** */
+ uarg[1] = (intptr_t) p->timeout; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* getresuid */
+ case 360: {
+ struct getresuid_args *p = params;
+ uarg[0] = (intptr_t) p->ruid; /* uid_t * */
+ uarg[1] = (intptr_t) p->euid; /* uid_t * */
+ uarg[2] = (intptr_t) p->suid; /* uid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* getresgid */
+ case 361: {
+ struct getresgid_args *p = params;
+ uarg[0] = (intptr_t) p->rgid; /* gid_t * */
+ uarg[1] = (intptr_t) p->egid; /* gid_t * */
+ uarg[2] = (intptr_t) p->sgid; /* gid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* kqueue */
+ case 362: {
+ *n_args = 0;
+ break;
+ }
+ /* kevent */
+ case 363: {
+ struct kevent_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->changelist; /* struct kevent * */
+ iarg[2] = p->nchanges; /* int */
+ uarg[3] = (intptr_t) p->eventlist; /* struct kevent * */
+ iarg[4] = p->nevents; /* int */
+ uarg[5] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 6;
+ break;
+ }
+ /* extattr_set_fd */
+ case 371: {
+ struct extattr_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_fd */
+ case 372: {
+ struct extattr_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_fd */
+ case 373: {
+ struct extattr_delete_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __setugid */
+ case 374: {
+ struct __setugid_args *p = params;
+ iarg[0] = p->flag; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* eaccess */
+ case 376: {
+ struct eaccess_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->amode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* afs3_syscall */
+ case 377: {
+ struct afs3_syscall_args *p = params;
+ iarg[0] = p->syscall; /* long */
+ iarg[1] = p->parm1; /* long */
+ iarg[2] = p->parm2; /* long */
+ iarg[3] = p->parm3; /* long */
+ iarg[4] = p->parm4; /* long */
+ iarg[5] = p->parm5; /* long */
+ iarg[6] = p->parm6; /* long */
+ *n_args = 7;
+ break;
+ }
+ /* nmount */
+ case 378: {
+ struct nmount_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* __mac_get_proc */
+ case 384: {
+ struct __mac_get_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_set_proc */
+ case 385: {
+ struct __mac_set_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_fd */
+ case 386: {
+ struct __mac_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_file */
+ case 387: {
+ struct __mac_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_fd */
+ case 388: {
+ struct __mac_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_file */
+ case 389: {
+ struct __mac_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* kenv */
+ case 390: {
+ struct kenv_args *p = params;
+ iarg[0] = p->what; /* int */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ uarg[2] = (intptr_t) p->value; /* char * */
+ iarg[3] = p->len; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* lchflags */
+ case 391: {
+ struct lchflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* uuidgen */
+ case 392: {
+ struct uuidgen_args *p = params;
+ uarg[0] = (intptr_t) p->store; /* struct uuid * */
+ iarg[1] = p->count; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendfile */
+ case 393: {
+ struct sendfile_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ iarg[2] = p->offset; /* off_t */
+ uarg[3] = p->nbytes; /* size_t */
+ uarg[4] = (intptr_t) p->hdtr; /* struct sf_hdtr * */
+ uarg[5] = (intptr_t) p->sbytes; /* off_t * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* mac_syscall */
+ case 394: {
+ struct mac_syscall_args *p = params;
+ uarg[0] = (intptr_t) p->policy; /* const char * */
+ iarg[1] = p->call; /* int */
+ uarg[2] = (intptr_t) p->arg; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* getfsstat */
+ case 395: {
+ struct getfsstat_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* struct statfs * */
+ iarg[1] = p->bufsize; /* long */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* statfs */
+ case 396: {
+ struct statfs_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fstatfs */
+ case 397: {
+ struct fstatfs_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fhstatfs */
+ case 398: {
+ struct fhstatfs_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_close */
+ case 400: {
+ struct ksem_close_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_post */
+ case 401: {
+ struct ksem_post_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_wait */
+ case 402: {
+ struct ksem_wait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_trywait */
+ case 403: {
+ struct ksem_trywait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_init */
+ case 404: {
+ struct ksem_init_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = p->value; /* unsigned int */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_open */
+ case 405: {
+ struct ksem_open_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ iarg[2] = p->oflag; /* int */
+ iarg[3] = p->mode; /* mode_t */
+ uarg[4] = p->value; /* unsigned int */
+ *n_args = 5;
+ break;
+ }
+ /* ksem_unlink */
+ case 406: {
+ struct ksem_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_getvalue */
+ case 407: {
+ struct ksem_getvalue_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->val; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_destroy */
+ case 408: {
+ struct ksem_destroy_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_pid */
+ case 409: {
+ struct __mac_get_pid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_link */
+ case 410: {
+ struct __mac_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_link */
+ case 411: {
+ struct __mac_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* extattr_set_link */
+ case 412: {
+ struct extattr_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_link */
+ case 413: {
+ struct extattr_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_link */
+ case 414: {
+ struct extattr_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __mac_execve */
+ case 415: {
+ struct __mac_execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ uarg[3] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 4;
+ break;
+ }
+ /* sigaction */
+ case 416: {
+ struct sigaction_args *p = params;
+ iarg[0] = p->sig; /* int */
+ uarg[1] = (intptr_t) p->act; /* const struct sigaction * */
+ uarg[2] = (intptr_t) p->oact; /* struct sigaction * */
+ *n_args = 3;
+ break;
+ }
+ /* sigreturn */
+ case 417: {
+ struct sigreturn_args *p = params;
+ uarg[0] = (intptr_t) p->sigcntxp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* getcontext */
+ case 421: {
+ struct getcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* setcontext */
+ case 422: {
+ struct setcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* swapcontext */
+ case 423: {
+ struct swapcontext_args *p = params;
+ uarg[0] = (intptr_t) p->oucp; /* struct __ucontext * */
+ uarg[1] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 2;
+ break;
+ }
+ /* swapoff */
+ case 424: {
+ struct swapoff_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* __acl_get_link */
+ case 425: {
+ struct __acl_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_link */
+ case 426: {
+ struct __acl_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_link */
+ case 427: {
+ struct __acl_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_link */
+ case 428: {
+ struct __acl_aclcheck_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwait */
+ case 429: {
+ struct sigwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->sig; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_create */
+ case 430: {
+ struct thr_create_args *p = params;
+ uarg[0] = (intptr_t) p->ctx; /* ucontext_t * */
+ uarg[1] = (intptr_t) p->id; /* long * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* thr_exit */
+ case 431: {
+ struct thr_exit_args *p = params;
+ uarg[0] = (intptr_t) p->state; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_self */
+ case 432: {
+ struct thr_self_args *p = params;
+ uarg[0] = (intptr_t) p->id; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_kill */
+ case 433: {
+ struct thr_kill_args *p = params;
+ iarg[0] = p->id; /* long */
+ iarg[1] = p->sig; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* _umtx_lock */
+ case 434: {
+ struct _umtx_lock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_unlock */
+ case 435: {
+ struct _umtx_unlock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* jail_attach */
+ case 436: {
+ struct jail_attach_args *p = params;
+ iarg[0] = p->jid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* extattr_list_fd */
+ case 437: {
+ struct extattr_list_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_file */
+ case 438: {
+ struct extattr_list_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_link */
+ case 439: {
+ struct extattr_list_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* ksem_timedwait */
+ case 441: {
+ struct ksem_timedwait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->abstime; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_suspend */
+ case 442: {
+ struct thr_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_wake */
+ case 443: {
+ struct thr_wake_args *p = params;
+ iarg[0] = p->id; /* long */
+ *n_args = 1;
+ break;
+ }
+ /* kldunloadf */
+ case 444: {
+ struct kldunloadf_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* audit */
+ case 445: {
+ struct audit_args *p = params;
+ uarg[0] = (intptr_t) p->record; /* const void * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditon */
+ case 446: {
+ struct auditon_args *p = params;
+ iarg[0] = p->cmd; /* int */
+ uarg[1] = (intptr_t) p->data; /* void * */
+ uarg[2] = p->length; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* getauid */
+ case 447: {
+ struct getauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* setauid */
+ case 448: {
+ struct setauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit */
+ case 449: {
+ struct getaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* setaudit */
+ case 450: {
+ struct setaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit_addr */
+ case 451: {
+ struct getaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setaudit_addr */
+ case 452: {
+ struct setaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditctl */
+ case 453: {
+ struct auditctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_op */
+ case 454: {
+ struct _umtx_op_args *p = params;
+ uarg[0] = (intptr_t) p->obj; /* void * */
+ iarg[1] = p->op; /* int */
+ uarg[2] = p->val; /* u_long */
+ uarg[3] = (intptr_t) p->uaddr1; /* void * */
+ uarg[4] = (intptr_t) p->uaddr2; /* void * */
+ *n_args = 5;
+ break;
+ }
+ /* thr_new */
+ case 455: {
+ struct thr_new_args *p = params;
+ uarg[0] = (intptr_t) p->param; /* struct thr_param * */
+ iarg[1] = p->param_size; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sigqueue */
+ case 456: {
+ struct sigqueue_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->signum; /* int */
+ uarg[2] = (intptr_t) p->value; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_open */
+ case 457: {
+ struct kmq_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* mode_t */
+ uarg[3] = (intptr_t) p->attr; /* const struct mq_attr * */
+ *n_args = 4;
+ break;
+ }
+ /* kmq_setattr */
+ case 458: {
+ struct kmq_setattr_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->attr; /* const struct mq_attr * */
+ uarg[2] = (intptr_t) p->oattr; /* struct mq_attr * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_timedreceive */
+ case 459: {
+ struct kmq_timedreceive_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = (intptr_t) p->msg_prio; /* unsigned * */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_timedsend */
+ case 460: {
+ struct kmq_timedsend_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* const char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = p->msg_prio; /* unsigned */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_notify */
+ case 461: {
+ struct kmq_notify_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->sigev; /* const struct sigevent * */
+ *n_args = 2;
+ break;
+ }
+ /* kmq_unlink */
+ case 462: {
+ struct kmq_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* abort2 */
+ case 463: {
+ struct abort2_args *p = params;
+ uarg[0] = (intptr_t) p->why; /* const char * */
+ iarg[1] = p->nargs; /* int */
+ uarg[2] = (intptr_t) p->args; /* void ** */
+ *n_args = 3;
+ break;
+ }
+ /* thr_set_name */
+ case 464: {
+ struct thr_set_name_args *p = params;
+ iarg[0] = p->id; /* long */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_fsync */
+ case 465: {
+ struct aio_fsync_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio_thread */
+ case 466: {
+ struct rtprio_thread_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->lwpid; /* lwpid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* sctp_peeloff */
+ case 471: {
+ struct sctp_peeloff_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = p->name; /* uint32_t */
+ *n_args = 2;
+ break;
+ }
+ /* sctp_generic_sendmsg */
+ case 472: {
+ struct sctp_generic_sendmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->msg; /* caddr_t */
+ iarg[2] = p->mlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_sendmsg_iov */
+ case 473: {
+ struct sctp_generic_sendmsg_iov_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_recvmsg */
+ case 474: {
+ struct sctp_generic_recvmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->from; /* struct sockaddr * */
+ uarg[4] = (intptr_t) p->fromlenaddr; /* __socklen_t * */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ uarg[6] = (intptr_t) p->msg_flags; /* int * */
+ *n_args = 7;
+ break;
+ }
+ /* pread */
+ case 475: {
+ struct pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwrite */
+ case 476: {
+ struct pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* mmap */
+ case 477: {
+ struct mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pos; /* off_t */
+ *n_args = 6;
+ break;
+ }
+ /* lseek */
+ case 478: {
+ struct lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->whence; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* truncate */
+ case 479: {
+ struct truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* ftruncate */
+ case 480: {
+ struct ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* thr_kill2 */
+ case 481: {
+ struct thr_kill2_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->id; /* long */
+ iarg[2] = p->sig; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* shm_open */
+ case 482: {
+ struct shm_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* shm_unlink */
+ case 483: {
+ struct shm_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* cpuset */
+ case 484: {
+ struct cpuset_args *p = params;
+ uarg[0] = (intptr_t) p->setid; /* cpusetid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* cpuset_setid */
+ case 485: {
+ struct cpuset_setid_args *p = params;
+ iarg[0] = p->which; /* cpuwhich_t */
+ iarg[1] = p->id; /* id_t */
+ iarg[2] = p->setid; /* cpusetid_t */
+ *n_args = 3;
+ break;
+ }
+ /* cpuset_getid */
+ case 486: {
+ struct cpuset_getid_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = (intptr_t) p->setid; /* cpusetid_t * */
+ *n_args = 4;
+ break;
+ }
+ /* cpuset_getaffinity */
+ case 487: {
+ struct cpuset_getaffinity_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = p->cpusetsize; /* size_t */
+ uarg[4] = (intptr_t) p->mask; /* cpuset_t * */
+ *n_args = 5;
+ break;
+ }
+ /* cpuset_setaffinity */
+ case 488: {
+ struct cpuset_setaffinity_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = p->cpusetsize; /* size_t */
+ uarg[4] = (intptr_t) p->mask; /* const cpuset_t * */
+ *n_args = 5;
+ break;
+ }
+ /* faccessat */
+ case 489: {
+ struct faccessat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->amode; /* int */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* fchmodat */
+ case 490: {
+ struct fchmodat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* fchownat */
+ case 491: {
+ struct fchownat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = p->uid; /* uid_t */
+ iarg[3] = p->gid; /* gid_t */
+ iarg[4] = p->flag; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* fexecve */
+ case 492: {
+ struct fexecve_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ *n_args = 3;
+ break;
+ }
+ /* fstatat */
+ case 493: {
+ struct fstatat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->buf; /* struct stat * */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* futimesat */
+ case 494: {
+ struct futimesat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->times; /* struct timeval * */
+ *n_args = 3;
+ break;
+ }
+ /* linkat */
+ case 495: {
+ struct linkat_args *p = params;
+ iarg[0] = p->fd1; /* int */
+ uarg[1] = (intptr_t) p->path1; /* char * */
+ iarg[2] = p->fd2; /* int */
+ uarg[3] = (intptr_t) p->path2; /* char * */
+ iarg[4] = p->flag; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* mkdirat */
+ case 496: {
+ struct mkdirat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* mkfifoat */
+ case 497: {
+ struct mkfifoat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* mknodat */
+ case 498: {
+ struct mknodat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ iarg[3] = p->dev; /* dev_t */
+ *n_args = 4;
+ break;
+ }
+ /* openat */
+ case 499: {
+ struct openat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flag; /* int */
+ iarg[3] = p->mode; /* mode_t */
+ *n_args = 4;
+ break;
+ }
+ /* readlinkat */
+ case 500: {
+ struct readlinkat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->buf; /* char * */
+ uarg[3] = p->bufsize; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* renameat */
+ case 501: {
+ struct renameat_args *p = params;
+ iarg[0] = p->oldfd; /* int */
+ uarg[1] = (intptr_t) p->old; /* char * */
+ iarg[2] = p->newfd; /* int */
+ uarg[3] = (intptr_t) p->new; /* char * */
+ *n_args = 4;
+ break;
+ }
+ /* symlinkat */
+ case 502: {
+ struct symlinkat_args *p = params;
+ uarg[0] = (intptr_t) p->path1; /* char * */
+ iarg[1] = p->fd; /* int */
+ uarg[2] = (intptr_t) p->path2; /* char * */
+ *n_args = 3;
+ break;
+ }
+ /* unlinkat */
+ case 503: {
+ struct unlinkat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flag; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* posix_openpt */
+ case 504: {
+ struct posix_openpt_args *p = params;
+ iarg[0] = p->flags; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* gssd_syscall */
+ case 505: {
+ struct gssd_syscall_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* jail_get */
+ case 506: {
+ struct jail_get_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* jail_set */
+ case 507: {
+ struct jail_set_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* jail_remove */
+ case 508: {
+ struct jail_remove_args *p = params;
+ iarg[0] = p->jid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* closefrom */
+ case 509: {
+ struct closefrom_args *p = params;
+ iarg[0] = p->lowfd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* __semctl */
+ case 510: {
+ struct __semctl_args *p = params;
+ iarg[0] = p->semid; /* int */
+ iarg[1] = p->semnum; /* int */
+ iarg[2] = p->cmd; /* int */
+ uarg[3] = (intptr_t) p->arg; /* union semun * */
+ *n_args = 4;
+ break;
+ }
+ /* msgctl */
+ case 511: {
+ struct msgctl_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct msqid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* shmctl */
+ case 512: {
+ struct shmctl_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct shmid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* lpathconf */
+ case 513: {
+ struct lpathconf_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* __cap_rights_get */
+ case 515: {
+ struct __cap_rights_get_args *p = params;
+ iarg[0] = p->version; /* int */
+ iarg[1] = p->fd; /* int */
+ uarg[2] = (intptr_t) p->rightsp; /* cap_rights_t * */
+ *n_args = 3;
+ break;
+ }
+ /* cap_enter */
+ case 516: {
+ *n_args = 0;
+ break;
+ }
+ /* cap_getmode */
+ case 517: {
+ struct cap_getmode_args *p = params;
+ uarg[0] = (intptr_t) p->modep; /* u_int * */
+ *n_args = 1;
+ break;
+ }
+ /* pdfork */
+ case 518: {
+ struct pdfork_args *p = params;
+ uarg[0] = (intptr_t) p->fdp; /* int * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* pdkill */
+ case 519: {
+ struct pdkill_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->signum; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* pdgetpid */
+ case 520: {
+ struct pdgetpid_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->pidp; /* pid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* pselect */
+ case 522: {
+ struct pselect_args *p = params;
+ iarg[0] = p->nd; /* int */
+ uarg[1] = (intptr_t) p->in; /* fd_set * */
+ uarg[2] = (intptr_t) p->ou; /* fd_set * */
+ uarg[3] = (intptr_t) p->ex; /* fd_set * */
+ uarg[4] = (intptr_t) p->ts; /* const struct timespec * */
+ uarg[5] = (intptr_t) p->sm; /* const sigset_t * */
+ *n_args = 6;
+ break;
+ }
+ /* getloginclass */
+ case 523: {
+ struct getloginclass_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ uarg[1] = p->namelen; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* setloginclass */
+ case 524: {
+ struct setloginclass_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* rctl_get_racct */
+ case 525: {
+ struct rctl_get_racct_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_get_rules */
+ case 526: {
+ struct rctl_get_rules_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_get_limits */
+ case 527: {
+ struct rctl_get_limits_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_add_rule */
+ case 528: {
+ struct rctl_add_rule_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_remove_rule */
+ case 529: {
+ struct rctl_remove_rule_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* posix_fallocate */
+ case 530: {
+ struct posix_fallocate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->len; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* posix_fadvise */
+ case 531: {
+ struct posix_fadvise_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->len; /* off_t */
+ iarg[3] = p->advice; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* wait6 */
+ case 532: {
+ struct wait6_args *p = params;
+ iarg[0] = p->idtype; /* int */
+ iarg[1] = p->id; /* id_t */
+ uarg[2] = (intptr_t) p->status; /* int * */
+ iarg[3] = p->options; /* int */
+ uarg[4] = (intptr_t) p->wrusage; /* struct __wrusage * */
+ uarg[5] = (intptr_t) p->info; /* siginfo_t * */
+ *n_args = 6;
+ break;
+ }
+ /* cap_rights_limit */
+ case 533: {
+ struct cap_rights_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->rightsp; /* cap_rights_t * */
+ *n_args = 2;
+ break;
+ }
+ /* cap_ioctls_limit */
+ case 534: {
+ struct cap_ioctls_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->cmds; /* const u_long * */
+ uarg[2] = p->ncmds; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* cap_ioctls_get */
+ case 535: {
+ struct cap_ioctls_get_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->cmds; /* u_long * */
+ uarg[2] = p->maxcmds; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* cap_fcntls_limit */
+ case 536: {
+ struct cap_fcntls_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->fcntlrights; /* uint32_t */
+ *n_args = 2;
+ break;
+ }
+ /* cap_fcntls_get */
+ case 537: {
+ struct cap_fcntls_get_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->fcntlrightsp; /* uint32_t * */
+ *n_args = 2;
+ break;
+ }
+ /* bindat */
+ case 538: {
+ struct bindat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ uarg[2] = (intptr_t) p->name; /* caddr_t */
+ iarg[3] = p->namelen; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* connectat */
+ case 539: {
+ struct connectat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ uarg[2] = (intptr_t) p->name; /* caddr_t */
+ iarg[3] = p->namelen; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* chflagsat */
+ case 540: {
+ struct chflagsat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* const char * */
+ uarg[2] = p->flags; /* u_long */
+ iarg[3] = p->atflag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* accept4 */
+ case 541: {
+ struct accept4_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+ iarg[3] = p->flags; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* pipe2 */
+ case 542: {
+ struct pipe2_args *p = params;
+ uarg[0] = (intptr_t) p->fildes; /* int * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* aio_mlock */
+ case 543: {
+ struct aio_mlock_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ default:
+ *n_args = 0;
+ break;
+ };
+}
+static void
+systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+ /* nosys */
+ case 0:
+ break;
+ /* sys_exit */
+ case 1:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fork */
+ case 2:
+ break;
+ /* read */
+ case 3:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* write */
+ case 4:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* open */
+ case 5:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* close */
+ case 6:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* wait4 */
+ case 7:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* link */
+ case 9:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unlink */
+ case 10:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chdir */
+ case 12:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchdir */
+ case 13:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mknod */
+ case 14:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chmod */
+ case 15:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chown */
+ case 16:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* obreak */
+ case 17:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpid */
+ case 20:
+ break;
+ /* mount */
+ case 21:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unmount */
+ case 22:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setuid */
+ case 23:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getuid */
+ case 24:
+ break;
+ /* geteuid */
+ case 25:
+ break;
+ /* ptrace */
+ case 26:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* recvmsg */
+ case 27:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct msghdr *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendmsg */
+ case 28:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct msghdr *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* recvfrom */
+ case 29:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 5:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* accept */
+ case 30:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpeername */
+ case 31:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsockname */
+ case 32:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* access */
+ case 33:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chflags */
+ case 34:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchflags */
+ case 35:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sync */
+ case 36:
+ break;
+ /* kill */
+ case 37:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getppid */
+ case 39:
+ break;
+ /* dup */
+ case 41:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pipe */
+ case 42:
+ break;
+ /* getegid */
+ case 43:
+ break;
+ /* profil */
+ case 44:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktrace */
+ case 45:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getgid */
+ case 47:
+ break;
+ /* getlogin */
+ case 49:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setlogin */
+ case 50:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* acct */
+ case 51:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigaltstack */
+ case 53:
+ switch(ndx) {
+ case 0:
+ p = "stack_t *";
+ break;
+ case 1:
+ p = "stack_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ioctl */
+ case 54:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* reboot */
+ case 55:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* revoke */
+ case 56:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* symlink */
+ case 57:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readlink */
+ case 58:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* execve */
+ case 59:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* umask */
+ case 60:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chroot */
+ case 61:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msync */
+ case 65:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* vfork */
+ case 66:
+ break;
+ /* sbrk */
+ case 69:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sstk */
+ case 70:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ovadvise */
+ case 72:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munmap */
+ case 73:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mprotect */
+ case 74:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* madvise */
+ case 75:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mincore */
+ case 78:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getgroups */
+ case 79:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgroups */
+ case 80:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpgrp */
+ case 81:
+ break;
+ /* setpgid */
+ case 82:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setitimer */
+ case 83:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct itimerval *";
+ break;
+ case 2:
+ p = "struct itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapon */
+ case 85:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getitimer */
+ case 86:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdtablesize */
+ case 89:
+ break;
+ /* dup2 */
+ case 90:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fcntl */
+ case 92:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* select */
+ case 93:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "fd_set *";
+ break;
+ case 2:
+ p = "fd_set *";
+ break;
+ case 3:
+ p = "fd_set *";
+ break;
+ case 4:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fsync */
+ case 95:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setpriority */
+ case 96:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* socket */
+ case 97:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* connect */
+ case 98:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpriority */
+ case 100:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* bind */
+ case 104:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setsockopt */
+ case 105:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* listen */
+ case 106:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* gettimeofday */
+ case 116:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getrusage */
+ case 117:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsockopt */
+ case 118:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readv */
+ case 120:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* writev */
+ case 121:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* settimeofday */
+ case 122:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchown */
+ case 123:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchmod */
+ case 124:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setreuid */
+ case 126:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setregid */
+ case 127:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rename */
+ case 128:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* flock */
+ case 131:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkfifo */
+ case 132:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendto */
+ case 133:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "caddr_t";
+ break;
+ case 5:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shutdown */
+ case 134:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* socketpair */
+ case 135:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkdir */
+ case 136:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rmdir */
+ case 137:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* utimes */
+ case 138:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* adjtime */
+ case 140:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setsid */
+ case 147:
+ break;
+ /* quotactl */
+ case 148:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nlm_syscall */
+ case 154:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nfssvc */
+ case 155:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lgetfh */
+ case 160:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct fhandle *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getfh */
+ case 161:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct fhandle *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sysarch */
+ case 165:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rtprio */
+ case 166:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t";
+ break;
+ case 2:
+ p = "struct rtprio *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* semsys */
+ case 169:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgsys */
+ case 170:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmsys */
+ case 171:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_pread */
+ case 173:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_pwrite */
+ case 174:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setfib */
+ case 175:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ntp_adjtime */
+ case 176:
+ switch(ndx) {
+ case 0:
+ p = "struct timex *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgid */
+ case 181:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setegid */
+ case 182:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* seteuid */
+ case 183:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* stat */
+ case 188:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstat */
+ case 189:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lstat */
+ case 190:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pathconf */
+ case 191:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fpathconf */
+ case 192:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getrlimit */
+ case 194:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setrlimit */
+ case 195:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdirentries */
+ case 196:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_mmap */
+ case 197:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "int";
+ break;
+ case 6:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nosys */
+ case 198:
+ break;
+ /* freebsd6_lseek */
+ case 199:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_truncate */
+ case 200:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_ftruncate */
+ case 201:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __sysctl */
+ case 202:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t *";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ case 5:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mlock */
+ case 203:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlock */
+ case 204:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* undelete */
+ case 205:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* futimes */
+ case 206:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpgid */
+ case 207:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* poll */
+ case 209:
+ switch(ndx) {
+ case 0:
+ p = "struct pollfd *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lkmnosys */
+ case 210:
+ break;
+ /* lkmnosys */
+ case 211:
+ break;
+ /* lkmnosys */
+ case 212:
+ break;
+ /* lkmnosys */
+ case 213:
+ break;
+ /* lkmnosys */
+ case 214:
+ break;
+ /* lkmnosys */
+ case 215:
+ break;
+ /* lkmnosys */
+ case 216:
+ break;
+ /* lkmnosys */
+ case 217:
+ break;
+ /* lkmnosys */
+ case 218:
+ break;
+ /* lkmnosys */
+ case 219:
+ break;
+ /* semget */
+ case 221:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* semop */
+ case 222:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sembuf *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgget */
+ case 225:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgsnd */
+ case 226:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgrcv */
+ case 227:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "long";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmat */
+ case 228:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmdt */
+ case 230:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmget */
+ case 231:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_gettime */
+ case 232:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_settime */
+ case 233:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_getres */
+ case 234:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_create */
+ case 235:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct sigevent *";
+ break;
+ case 2:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_delete */
+ case 236:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_settime */
+ case 237:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct itimerspec *";
+ break;
+ case 3:
+ p = "struct itimerspec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_gettime */
+ case 238:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct itimerspec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_getoverrun */
+ case 239:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nanosleep */
+ case 240:
+ switch(ndx) {
+ case 0:
+ p = "const struct timespec *";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_getcounter */
+ case 241:
+ switch(ndx) {
+ case 0:
+ p = "ffcounter *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_setestimate */
+ case 242:
+ switch(ndx) {
+ case 0:
+ p = "struct ffclock_estimate *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_getestimate */
+ case 243:
+ switch(ndx) {
+ case 0:
+ p = "struct ffclock_estimate *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_getcpuclockid2 */
+ case 247:
+ switch(ndx) {
+ case 0:
+ p = "id_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "clockid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ntp_gettime */
+ case 248:
+ switch(ndx) {
+ case 0:
+ p = "struct ntptimeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* minherit */
+ case 250:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rfork */
+ case 251:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* openbsd_poll */
+ case 252:
+ switch(ndx) {
+ case 0:
+ p = "struct pollfd *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* issetugid */
+ case 253:
+ break;
+ /* lchown */
+ case 254:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_read */
+ case 255:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_write */
+ case 256:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lio_listio */
+ case 257:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *const *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct sigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdents */
+ case 272:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchmod */
+ case 274:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchown */
+ case 275:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ case 2:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lutimes */
+ case 276:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msync */
+ case 277:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nstat */
+ case 278:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nfstat */
+ case 279:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nlstat */
+ case 280:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* preadv */
+ case 289:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pwritev */
+ case 290:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhopen */
+ case 298:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhstat */
+ case 299:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modnext */
+ case 300:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modstat */
+ case 301:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct module_stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modfnext */
+ case 302:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modfind */
+ case 303:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldload */
+ case 304:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldunload */
+ case 305:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldfind */
+ case 306:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldnext */
+ case 307:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldstat */
+ case 308:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct kld_file_stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldfirstmod */
+ case 309:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsid */
+ case 310:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setresuid */
+ case 311:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ case 2:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setresgid */
+ case 312:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ case 1:
+ p = "gid_t";
+ break;
+ case 2:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_return */
+ case 314:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_suspend */
+ case 315:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *const *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_cancel */
+ case 316:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_error */
+ case 317:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* oaio_read */
+ case 318:
+ switch(ndx) {
+ case 0:
+ p = "struct oaiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* oaio_write */
+ case 319:
+ switch(ndx) {
+ case 0:
+ p = "struct oaiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* olio_listio */
+ case 320:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct oaiocb *const *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct osigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* yield */
+ case 321:
+ break;
+ /* mlockall */
+ case 324:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlockall */
+ case 325:
+ break;
+ /* __getcwd */
+ case 326:
+ switch(ndx) {
+ case 0:
+ p = "u_char *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_setparam */
+ case 327:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "const struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_getparam */
+ case 328:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_setscheduler */
+ case 329:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_getscheduler */
+ case 330:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_yield */
+ case 331:
+ break;
+ /* sched_get_priority_max */
+ case 332:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_get_priority_min */
+ case 333:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_rr_get_interval */
+ case 334:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* utrace */
+ case 335:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldsym */
+ case 337:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail */
+ case 338:
+ switch(ndx) {
+ case 0:
+ p = "struct jail *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nnpfs_syscall */
+ case 339:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigprocmask */
+ case 340:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const sigset_t *";
+ break;
+ case 2:
+ p = "sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigsuspend */
+ case 341:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigpending */
+ case 343:
+ switch(ndx) {
+ case 0:
+ p = "sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigtimedwait */
+ case 345:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "siginfo_t *";
+ break;
+ case 2:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigwaitinfo */
+ case 346:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "siginfo_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_file */
+ case 347:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_file */
+ case 348:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_fd */
+ case 349:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_fd */
+ case 350:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_file */
+ case 351:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_fd */
+ case 352:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_file */
+ case 353:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_fd */
+ case 354:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattrctl */
+ case 355:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_file */
+ case 356:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_file */
+ case 357:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_file */
+ case 358:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_waitcomplete */
+ case 359:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb **";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresuid */
+ case 360:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ case 1:
+ p = "uid_t *";
+ break;
+ case 2:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresgid */
+ case 361:
+ switch(ndx) {
+ case 0:
+ p = "gid_t *";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ case 2:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kqueue */
+ case 362:
+ break;
+ /* kevent */
+ case 363:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct kevent *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct kevent *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_fd */
+ case 371:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_fd */
+ case 372:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_fd */
+ case 373:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __setugid */
+ case 374:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* eaccess */
+ case 376:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* afs3_syscall */
+ case 377:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "long";
+ break;
+ case 3:
+ p = "long";
+ break;
+ case 4:
+ p = "long";
+ break;
+ case 5:
+ p = "long";
+ break;
+ case 6:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nmount */
+ case 378:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_proc */
+ case 384:
+ switch(ndx) {
+ case 0:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_proc */
+ case 385:
+ switch(ndx) {
+ case 0:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_fd */
+ case 386:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_file */
+ case 387:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_fd */
+ case 388:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_file */
+ case 389:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kenv */
+ case 390:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchflags */
+ case 391:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* uuidgen */
+ case 392:
+ switch(ndx) {
+ case 0:
+ p = "struct uuid *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendfile */
+ case 393:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "struct sf_hdtr *";
+ break;
+ case 5:
+ p = "off_t *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mac_syscall */
+ case 394:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getfsstat */
+ case 395:
+ switch(ndx) {
+ case 0:
+ p = "struct statfs *";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* statfs */
+ case 396:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstatfs */
+ case 397:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhstatfs */
+ case 398:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_close */
+ case 400:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_post */
+ case 401:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_wait */
+ case 402:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_trywait */
+ case 403:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_init */
+ case 404:
+ switch(ndx) {
+ case 0:
+ p = "semid_t *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_open */
+ case 405:
+ switch(ndx) {
+ case 0:
+ p = "semid_t *";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "mode_t";
+ break;
+ case 4:
+ p = "unsigned int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_unlink */
+ case 406:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_getvalue */
+ case 407:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_destroy */
+ case 408:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_pid */
+ case 409:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_link */
+ case 410:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_link */
+ case 411:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_link */
+ case 412:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_link */
+ case 413:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_link */
+ case 414:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_execve */
+ case 415:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ case 3:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigaction */
+ case 416:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct sigaction *";
+ break;
+ case 2:
+ p = "struct sigaction *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigreturn */
+ case 417:
+ switch(ndx) {
+ case 0:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getcontext */
+ case 421:
+ switch(ndx) {
+ case 0:
+ p = "struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setcontext */
+ case 422:
+ switch(ndx) {
+ case 0:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapcontext */
+ case 423:
+ switch(ndx) {
+ case 0:
+ p = "struct __ucontext *";
+ break;
+ case 1:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapoff */
+ case 424:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_link */
+ case 425:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_link */
+ case 426:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_link */
+ case 427:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_link */
+ case 428:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigwait */
+ case 429:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_create */
+ case 430:
+ switch(ndx) {
+ case 0:
+ p = "ucontext_t *";
+ break;
+ case 1:
+ p = "long *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_exit */
+ case 431:
+ switch(ndx) {
+ case 0:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_self */
+ case 432:
+ switch(ndx) {
+ case 0:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_kill */
+ case 433:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_lock */
+ case 434:
+ switch(ndx) {
+ case 0:
+ p = "struct umtx *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_unlock */
+ case 435:
+ switch(ndx) {
+ case 0:
+ p = "struct umtx *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_attach */
+ case 436:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_fd */
+ case 437:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_file */
+ case 438:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_link */
+ case 439:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_timedwait */
+ case 441:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ case 1:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_suspend */
+ case 442:
+ switch(ndx) {
+ case 0:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_wake */
+ case 443:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldunloadf */
+ case 444:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* audit */
+ case 445:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* auditon */
+ case 446:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getauid */
+ case 447:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setauid */
+ case 448:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getaudit */
+ case 449:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setaudit */
+ case 450:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getaudit_addr */
+ case 451:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo_addr *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setaudit_addr */
+ case 452:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo_addr *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* auditctl */
+ case 453:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_op */
+ case 454:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "u_long";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_new */
+ case 455:
+ switch(ndx) {
+ case 0:
+ p = "struct thr_param *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigqueue */
+ case 456:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_open */
+ case 457:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "const struct mq_attr *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_setattr */
+ case 458:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct mq_attr *";
+ break;
+ case 2:
+ p = "struct mq_attr *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_timedreceive */
+ case 459:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "unsigned *";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_timedsend */
+ case 460:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "unsigned";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_notify */
+ case 461:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct sigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_unlink */
+ case 462:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* abort2 */
+ case 463:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_set_name */
+ case 464:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_fsync */
+ case 465:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rtprio_thread */
+ case 466:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "lwpid_t";
+ break;
+ case 2:
+ p = "struct rtprio *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_peeloff */
+ case 471:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_sendmsg */
+ case 472:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "__socklen_t";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_sendmsg_iov */
+ case 473:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "__socklen_t";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_recvmsg */
+ case 474:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct sockaddr *";
+ break;
+ case 4:
+ p = "__socklen_t *";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pread */
+ case 475:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pwrite */
+ case 476:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mmap */
+ case 477:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lseek */
+ case 478:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* truncate */
+ case 479:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ftruncate */
+ case 480:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_kill2 */
+ case 481:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shm_open */
+ case 482:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shm_unlink */
+ case 483:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset */
+ case 484:
+ switch(ndx) {
+ case 0:
+ p = "cpusetid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_setid */
+ case 485:
+ switch(ndx) {
+ case 0:
+ p = "cpuwhich_t";
+ break;
+ case 1:
+ p = "id_t";
+ break;
+ case 2:
+ p = "cpusetid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_getid */
+ case 486:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "cpusetid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_getaffinity */
+ case 487:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "cpuset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_setaffinity */
+ case 488:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "const cpuset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* faccessat */
+ case 489:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchmodat */
+ case 490:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchownat */
+ case 491:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "uid_t";
+ break;
+ case 3:
+ p = "gid_t";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fexecve */
+ case 492:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstatat */
+ case 493:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct stat *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* futimesat */
+ case 494:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linkat */
+ case 495:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkdirat */
+ case 496:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkfifoat */
+ case 497:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mknodat */
+ case 498:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "dev_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* openat */
+ case 499:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readlinkat */
+ case 500:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* renameat */
+ case 501:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* symlinkat */
+ case 502:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unlinkat */
+ case 503:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_openpt */
+ case 504:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* gssd_syscall */
+ case 505:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_get */
+ case 506:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_set */
+ case 507:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_remove */
+ case 508:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* closefrom */
+ case 509:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __semctl */
+ case 510:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "union semun *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgctl */
+ case 511:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "struct msqid_ds *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmctl */
+ case 512:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "struct shmid_ds *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lpathconf */
+ case 513:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __cap_rights_get */
+ case 515:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "cap_rights_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_enter */
+ case 516:
+ break;
+ /* cap_getmode */
+ case 517:
+ switch(ndx) {
+ case 0:
+ p = "u_int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdfork */
+ case 518:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdkill */
+ case 519:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdgetpid */
+ case 520:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pselect */
+ case 522:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "fd_set *";
+ break;
+ case 2:
+ p = "fd_set *";
+ break;
+ case 3:
+ p = "fd_set *";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ case 5:
+ p = "const sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getloginclass */
+ case 523:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setloginclass */
+ case 524:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_racct */
+ case 525:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_rules */
+ case 526:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_limits */
+ case 527:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_add_rule */
+ case 528:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_remove_rule */
+ case 529:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_fallocate */
+ case 530:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_fadvise */
+ case 531:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* wait6 */
+ case 532:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "id_t";
+ break;
+ case 2:
+ p = "int *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "struct __wrusage *";
+ break;
+ case 5:
+ p = "siginfo_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_rights_limit */
+ case 533:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "cap_rights_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_ioctls_limit */
+ case 534:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const u_long *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_ioctls_get */
+ case 535:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_fcntls_limit */
+ case 536:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_fcntls_get */
+ case 537:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* bindat */
+ case 538:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* connectat */
+ case 539:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chflagsat */
+ case 540:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "u_long";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* accept4 */
+ case 541:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pipe2 */
+ case 542:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_mlock */
+ case 543:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
+static void
+systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+ /* nosys */
+ case 0:
+ /* sys_exit */
+ case 1:
+ if (ndx == 0 || ndx == 1)
+ p = "void";
+ break;
+ /* fork */
+ case 2:
+ /* read */
+ case 3:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* write */
+ case 4:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* open */
+ case 5:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* close */
+ case 6:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* wait4 */
+ case 7:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* link */
+ case 9:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unlink */
+ case 10:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chdir */
+ case 12:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchdir */
+ case 13:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mknod */
+ case 14:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chmod */
+ case 15:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chown */
+ case 16:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* obreak */
+ case 17:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpid */
+ case 20:
+ /* mount */
+ case 21:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unmount */
+ case 22:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setuid */
+ case 23:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getuid */
+ case 24:
+ /* geteuid */
+ case 25:
+ /* ptrace */
+ case 26:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* recvmsg */
+ case 27:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendmsg */
+ case 28:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* recvfrom */
+ case 29:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* accept */
+ case 30:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpeername */
+ case 31:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsockname */
+ case 32:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* access */
+ case 33:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chflags */
+ case 34:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchflags */
+ case 35:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sync */
+ case 36:
+ /* kill */
+ case 37:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getppid */
+ case 39:
+ /* dup */
+ case 41:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pipe */
+ case 42:
+ /* getegid */
+ case 43:
+ /* profil */
+ case 44:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktrace */
+ case 45:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getgid */
+ case 47:
+ /* getlogin */
+ case 49:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setlogin */
+ case 50:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* acct */
+ case 51:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigaltstack */
+ case 53:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ioctl */
+ case 54:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* reboot */
+ case 55:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* revoke */
+ case 56:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* symlink */
+ case 57:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readlink */
+ case 58:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* execve */
+ case 59:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* umask */
+ case 60:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chroot */
+ case 61:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msync */
+ case 65:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* vfork */
+ case 66:
+ /* sbrk */
+ case 69:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sstk */
+ case 70:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ovadvise */
+ case 72:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munmap */
+ case 73:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mprotect */
+ case 74:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* madvise */
+ case 75:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mincore */
+ case 78:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getgroups */
+ case 79:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgroups */
+ case 80:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpgrp */
+ case 81:
+ /* setpgid */
+ case 82:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setitimer */
+ case 83:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapon */
+ case 85:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getitimer */
+ case 86:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdtablesize */
+ case 89:
+ /* dup2 */
+ case 90:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fcntl */
+ case 92:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* select */
+ case 93:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fsync */
+ case 95:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setpriority */
+ case 96:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* socket */
+ case 97:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* connect */
+ case 98:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpriority */
+ case 100:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* bind */
+ case 104:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setsockopt */
+ case 105:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* listen */
+ case 106:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* gettimeofday */
+ case 116:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getrusage */
+ case 117:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsockopt */
+ case 118:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readv */
+ case 120:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* writev */
+ case 121:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* settimeofday */
+ case 122:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchown */
+ case 123:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchmod */
+ case 124:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setreuid */
+ case 126:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setregid */
+ case 127:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rename */
+ case 128:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* flock */
+ case 131:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkfifo */
+ case 132:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendto */
+ case 133:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shutdown */
+ case 134:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* socketpair */
+ case 135:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkdir */
+ case 136:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rmdir */
+ case 137:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* utimes */
+ case 138:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* adjtime */
+ case 140:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setsid */
+ case 147:
+ /* quotactl */
+ case 148:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nlm_syscall */
+ case 154:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nfssvc */
+ case 155:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lgetfh */
+ case 160:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getfh */
+ case 161:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sysarch */
+ case 165:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rtprio */
+ case 166:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* semsys */
+ case 169:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgsys */
+ case 170:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmsys */
+ case 171:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_pread */
+ case 173:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* freebsd6_pwrite */
+ case 174:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* setfib */
+ case 175:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ntp_adjtime */
+ case 176:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgid */
+ case 181:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setegid */
+ case 182:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* seteuid */
+ case 183:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* stat */
+ case 188:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstat */
+ case 189:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lstat */
+ case 190:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pathconf */
+ case 191:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fpathconf */
+ case 192:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getrlimit */
+ case 194:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setrlimit */
+ case 195:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdirentries */
+ case 196:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_mmap */
+ case 197:
+ if (ndx == 0 || ndx == 1)
+ p = "caddr_t";
+ break;
+ /* nosys */
+ case 198:
+ /* freebsd6_lseek */
+ case 199:
+ if (ndx == 0 || ndx == 1)
+ p = "off_t";
+ break;
+ /* freebsd6_truncate */
+ case 200:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_ftruncate */
+ case 201:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __sysctl */
+ case 202:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mlock */
+ case 203:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlock */
+ case 204:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* undelete */
+ case 205:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* futimes */
+ case 206:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpgid */
+ case 207:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* poll */
+ case 209:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lkmnosys */
+ case 210:
+ /* lkmnosys */
+ case 211:
+ /* lkmnosys */
+ case 212:
+ /* lkmnosys */
+ case 213:
+ /* lkmnosys */
+ case 214:
+ /* lkmnosys */
+ case 215:
+ /* lkmnosys */
+ case 216:
+ /* lkmnosys */
+ case 217:
+ /* lkmnosys */
+ case 218:
+ /* lkmnosys */
+ case 219:
+ /* semget */
+ case 221:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* semop */
+ case 222:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgget */
+ case 225:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgsnd */
+ case 226:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgrcv */
+ case 227:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmat */
+ case 228:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmdt */
+ case 230:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmget */
+ case 231:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_gettime */
+ case 232:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_settime */
+ case 233:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_getres */
+ case 234:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_create */
+ case 235:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_delete */
+ case 236:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_settime */
+ case 237:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_gettime */
+ case 238:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_getoverrun */
+ case 239:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nanosleep */
+ case 240:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_getcounter */
+ case 241:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_setestimate */
+ case 242:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_getestimate */
+ case 243:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_getcpuclockid2 */
+ case 247:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ntp_gettime */
+ case 248:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* minherit */
+ case 250:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rfork */
+ case 251:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* openbsd_poll */
+ case 252:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* issetugid */
+ case 253:
+ /* lchown */
+ case 254:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_read */
+ case 255:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_write */
+ case 256:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lio_listio */
+ case 257:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdents */
+ case 272:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchmod */
+ case 274:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchown */
+ case 275:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lutimes */
+ case 276:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msync */
+ case 277:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nstat */
+ case 278:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nfstat */
+ case 279:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nlstat */
+ case 280:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* preadv */
+ case 289:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* pwritev */
+ case 290:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* fhopen */
+ case 298:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fhstat */
+ case 299:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modnext */
+ case 300:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modstat */
+ case 301:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modfnext */
+ case 302:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modfind */
+ case 303:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldload */
+ case 304:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldunload */
+ case 305:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldfind */
+ case 306:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldnext */
+ case 307:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldstat */
+ case 308:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldfirstmod */
+ case 309:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsid */
+ case 310:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setresuid */
+ case 311:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setresgid */
+ case 312:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_return */
+ case 314:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_suspend */
+ case 315:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_cancel */
+ case 316:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_error */
+ case 317:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* oaio_read */
+ case 318:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* oaio_write */
+ case 319:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* olio_listio */
+ case 320:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* yield */
+ case 321:
+ /* mlockall */
+ case 324:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlockall */
+ case 325:
+ /* __getcwd */
+ case 326:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_setparam */
+ case 327:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_getparam */
+ case 328:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_setscheduler */
+ case 329:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_getscheduler */
+ case 330:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_yield */
+ case 331:
+ /* sched_get_priority_max */
+ case 332:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_get_priority_min */
+ case 333:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_rr_get_interval */
+ case 334:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* utrace */
+ case 335:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldsym */
+ case 337:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail */
+ case 338:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nnpfs_syscall */
+ case 339:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigprocmask */
+ case 340:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigsuspend */
+ case 341:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigpending */
+ case 343:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigtimedwait */
+ case 345:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigwaitinfo */
+ case 346:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_file */
+ case 347:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_file */
+ case 348:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_fd */
+ case 349:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_fd */
+ case 350:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_file */
+ case 351:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_fd */
+ case 352:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_file */
+ case 353:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_fd */
+ case 354:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattrctl */
+ case 355:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_file */
+ case 356:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_file */
+ case 357:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_file */
+ case 358:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_waitcomplete */
+ case 359:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresuid */
+ case 360:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresgid */
+ case 361:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kqueue */
+ case 362:
+ /* kevent */
+ case 363:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_fd */
+ case 371:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_fd */
+ case 372:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_fd */
+ case 373:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __setugid */
+ case 374:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* eaccess */
+ case 376:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* afs3_syscall */
+ case 377:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nmount */
+ case 378:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_proc */
+ case 384:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_proc */
+ case 385:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_fd */
+ case 386:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_file */
+ case 387:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_fd */
+ case 388:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_file */
+ case 389:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kenv */
+ case 390:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchflags */
+ case 391:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* uuidgen */
+ case 392:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendfile */
+ case 393:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mac_syscall */
+ case 394:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getfsstat */
+ case 395:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* statfs */
+ case 396:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstatfs */
+ case 397:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fhstatfs */
+ case 398:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_close */
+ case 400:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_post */
+ case 401:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_wait */
+ case 402:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_trywait */
+ case 403:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_init */
+ case 404:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_open */
+ case 405:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_unlink */
+ case 406:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_getvalue */
+ case 407:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_destroy */
+ case 408:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_pid */
+ case 409:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_link */
+ case 410:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_link */
+ case 411:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_link */
+ case 412:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_link */
+ case 413:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_link */
+ case 414:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_execve */
+ case 415:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigaction */
+ case 416:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigreturn */
+ case 417:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getcontext */
+ case 421:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setcontext */
+ case 422:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapcontext */
+ case 423:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapoff */
+ case 424:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_link */
+ case 425:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_link */
+ case 426:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_link */
+ case 427:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_link */
+ case 428:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigwait */
+ case 429:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_create */
+ case 430:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_exit */
+ case 431:
+ if (ndx == 0 || ndx == 1)
+ p = "void";
+ break;
+ /* thr_self */
+ case 432:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_kill */
+ case 433:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_lock */
+ case 434:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_unlock */
+ case 435:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_attach */
+ case 436:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_list_fd */
+ case 437:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_list_file */
+ case 438:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_list_link */
+ case 439:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* ksem_timedwait */
+ case 441:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_suspend */
+ case 442:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_wake */
+ case 443:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldunloadf */
+ case 444:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* audit */
+ case 445:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* auditon */
+ case 446:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getauid */
+ case 447:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setauid */
+ case 448:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getaudit */
+ case 449:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setaudit */
+ case 450:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getaudit_addr */
+ case 451:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setaudit_addr */
+ case 452:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* auditctl */
+ case 453:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_op */
+ case 454:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_new */
+ case 455:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigqueue */
+ case 456:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_open */
+ case 457:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_setattr */
+ case 458:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_timedreceive */
+ case 459:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_timedsend */
+ case 460:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_notify */
+ case 461:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_unlink */
+ case 462:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* abort2 */
+ case 463:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_set_name */
+ case 464:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_fsync */
+ case 465:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rtprio_thread */
+ case 466:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_peeloff */
+ case 471:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_sendmsg */
+ case 472:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_sendmsg_iov */
+ case 473:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_recvmsg */
+ case 474:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pread */
+ case 475:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* pwrite */
+ case 476:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* mmap */
+ case 477:
+ if (ndx == 0 || ndx == 1)
+ p = "caddr_t";
+ break;
+ /* lseek */
+ case 478:
+ if (ndx == 0 || ndx == 1)
+ p = "off_t";
+ break;
+ /* truncate */
+ case 479:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ftruncate */
+ case 480:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_kill2 */
+ case 481:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shm_open */
+ case 482:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shm_unlink */
+ case 483:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset */
+ case 484:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_setid */
+ case 485:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_getid */
+ case 486:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_getaffinity */
+ case 487:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_setaffinity */
+ case 488:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* faccessat */
+ case 489:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchmodat */
+ case 490:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchownat */
+ case 491:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fexecve */
+ case 492:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstatat */
+ case 493:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* futimesat */
+ case 494:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linkat */
+ case 495:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkdirat */
+ case 496:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkfifoat */
+ case 497:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mknodat */
+ case 498:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* openat */
+ case 499:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readlinkat */
+ case 500:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* renameat */
+ case 501:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* symlinkat */
+ case 502:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unlinkat */
+ case 503:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_openpt */
+ case 504:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* gssd_syscall */
+ case 505:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_get */
+ case 506:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_set */
+ case 507:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_remove */
+ case 508:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* closefrom */
+ case 509:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __semctl */
+ case 510:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgctl */
+ case 511:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmctl */
+ case 512:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lpathconf */
+ case 513:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __cap_rights_get */
+ case 515:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_enter */
+ case 516:
+ /* cap_getmode */
+ case 517:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdfork */
+ case 518:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdkill */
+ case 519:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdgetpid */
+ case 520:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pselect */
+ case 522:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getloginclass */
+ case 523:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setloginclass */
+ case 524:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_racct */
+ case 525:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_rules */
+ case 526:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_limits */
+ case 527:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_add_rule */
+ case 528:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_remove_rule */
+ case 529:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_fallocate */
+ case 530:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_fadvise */
+ case 531:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* wait6 */
+ case 532:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_rights_limit */
+ case 533:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_ioctls_limit */
+ case 534:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_ioctls_get */
+ case 535:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* cap_fcntls_limit */
+ case 536:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_fcntls_get */
+ case 537:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* bindat */
+ case 538:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* connectat */
+ case 539:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chflagsat */
+ case 540:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* accept4 */
+ case 541:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pipe2 */
+ case 542:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_mlock */
+ case 543:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c
new file mode 100644
index 0000000..e402cb5
--- /dev/null
+++ b/sys/kern/sysv_ipc.c
@@ -0,0 +1,246 @@
+/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
+/*-
+ * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca>
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Herb Peyerl.
+ * 4. The name of Herb Peyerl may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/ucred.h>
+
+void (*shmfork_hook)(struct proc *, struct proc *) = NULL;
+void (*shmexit_hook)(struct vmspace *) = NULL;
+
+/* called from kern_fork.c */
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+
+ if (shmfork_hook != NULL)
+ shmfork_hook(p1, p2);
+ return;
+}
+
+/* called from kern_exit.c */
+void
+shmexit(struct vmspace *vm)
+{
+
+ if (shmexit_hook != NULL)
+ shmexit_hook(vm);
+ return;
+}
+
+/*
+ * Check for IPC permission.
+ *
+ * Note: The MAC Framework does not require any modifications to the
+ * ipcperm() function, as access control checks are performed throughout the
+ * implementation of each primitive. Those entry point calls complement the
+ * ipcperm() discretionary checks. Unlike file system discretionary access
+ * control, the original creator of an object is given the same rights as the
+ * current owner.
+ */
+int
+ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode)
+{
+ struct ucred *cred = td->td_ucred;
+ int error, obj_mode, dac_granted, priv_granted;
+
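+ /*
+ * Select the permission bits in perm->mode that apply to the caller:
+ * the owner/creator bits are used as-is (and the owner is granted
+ * IPC_M outright), group bits are shifted left by 3 and "other" bits
+ * by 6 so that the IPC_R and IPC_W tests below examine the right set.
+ */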
+ dac_granted = 0;
+ if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) {
+ obj_mode = perm->mode;
+ dac_granted |= IPC_M;
+ } else if (groupmember(perm->gid, cred) ||
+ groupmember(perm->cgid, cred)) {
+ obj_mode = perm->mode;
+ obj_mode <<= 3;
+ } else {
+ obj_mode = perm->mode;
+ obj_mode <<= 6;
+ }
+
+ /*
+ * While the System V IPC permission model allows IPC_M to be
+ * granted, as part of the mode, our implementation requires
+ * privilege to administer the object if not the owner or creator.
+ */
+#if 0
+ if (obj_mode & IPC_M)
+ dac_granted |= IPC_M;
+#endif
+ if (obj_mode & IPC_R)
+ dac_granted |= IPC_R;
+ if (obj_mode & IPC_W)
+ dac_granted |= IPC_W;
+
+ /*
+ * Simple case: all required rights are granted by DAC.
+ */
+ if ((dac_granted & acc_mode) == acc_mode)
+ return (0);
+
+ /*
+ * Privilege is required to satisfy the request.
+ */
+ priv_granted = 0;
+ if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) {
+ error = priv_check(td, PRIV_IPC_ADMIN);
+ if (error == 0)
+ priv_granted |= IPC_M;
+ }
+
+ if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) {
+ error = priv_check(td, PRIV_IPC_READ);
+ if (error == 0)
+ priv_granted |= IPC_R;
+ }
+
+ if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) {
+ error = priv_check(td, PRIV_IPC_WRITE);
+ if (error == 0)
+ priv_granted |= IPC_W;
+ }
+
+ if (((dac_granted | priv_granted) & acc_mode) == acc_mode)
+ return (0);
+ else
+ return (EACCES);
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+void
+ipcperm_old2new(struct ipc_perm_old *old, struct ipc_perm *new)
+{
+
+ new->cuid = old->cuid;
+ new->cgid = old->cgid;
+ new->uid = old->uid;
+ new->gid = old->gid;
+ new->mode = old->mode;
+ new->seq = old->seq;
+ new->key = old->key;
+}
+
+void
+ipcperm_new2old(struct ipc_perm *new, struct ipc_perm_old *old)
+{
+
+ /* XXX: How to handle ID's > USHORT_MAX? */
+ old->cuid = new->cuid;
+ old->cgid = new->cgid;
+ old->uid = new->uid;
+ old->gid = new->gid;
+ old->mode = new->mode;
+ old->seq = new->seq;
+ old->key = new->key;
+}
+#endif
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
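+/*
+ * CP() (from the freebsd32 compat headers above) copies a single named
+ * field between the native and 32-bit ipc_perm layouts in the converters
+ * below.
+ */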
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+void
+freebsd32_ipcperm_old_in(struct ipc_perm32_old *ip32, struct ipc_perm *ip)
+{
+
+ CP(*ip32, *ip, cuid);
+ CP(*ip32, *ip, cgid);
+ CP(*ip32, *ip, uid);
+ CP(*ip32, *ip, gid);
+ CP(*ip32, *ip, mode);
+ CP(*ip32, *ip, seq);
+ CP(*ip32, *ip, key);
+}
+
+void
+freebsd32_ipcperm_old_out(struct ipc_perm *ip, struct ipc_perm32_old *ip32)
+{
+
+ CP(*ip, *ip32, cuid);
+ CP(*ip, *ip32, cgid);
+ CP(*ip, *ip32, uid);
+ CP(*ip, *ip32, gid);
+ CP(*ip, *ip32, mode);
+ CP(*ip, *ip32, seq);
+ CP(*ip, *ip32, key);
+}
+#endif
+
+void
+freebsd32_ipcperm_in(struct ipc_perm32 *ip32, struct ipc_perm *ip)
+{
+
+ CP(*ip32, *ip, cuid);
+ CP(*ip32, *ip, cgid);
+ CP(*ip32, *ip, uid);
+ CP(*ip32, *ip, gid);
+ CP(*ip32, *ip, mode);
+ CP(*ip32, *ip, seq);
+ CP(*ip32, *ip, key);
+}
+
+void
+freebsd32_ipcperm_out(struct ipc_perm *ip, struct ipc_perm32 *ip32)
+{
+
+ CP(*ip, *ip32, cuid);
+ CP(*ip, *ip32, cgid);
+ CP(*ip, *ip32, uid);
+ CP(*ip, *ip32, gid);
+ CP(*ip, *ip32, mode);
+ CP(*ip, *ip32, seq);
+ CP(*ip, *ip32, key);
+}
+#endif
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
new file mode 100644
index 0000000..d58cb7e
--- /dev/null
+++ b/sys/kern/sysv_msg.c
@@ -0,0 +1,1592 @@
+/*-
+ * Implementation of SVID messages
+ *
+ * Author: Daniel Boulet
+ *
+ * Copyright 1993 Daniel Boulet and RTMX Inc.
+ *
+ * This system call was implemented by Daniel Boulet under contract from RTMX.
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/msg.h>
+#include <sys/racct.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(sysv_msg, "System V message queues support");
+
+static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
+
+static int msginit(void);
+static int msgunload(void);
+static int sysvmsg_modload(struct module *, int, void *);
+
+#ifdef MSG_DEBUG
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a) (void)0
+#endif
+
+static void msg_freehdr(struct msg *msghdr);
+
+#ifndef MSGSSZ
+#define MSGSSZ 8 /* Each segment must be 2^N long */
+#endif
+#ifndef MSGSEG
+#define MSGSEG 2048 /* must be less than 32767 */
+#endif
+#define MSGMAX (MSGSSZ*MSGSEG)
+#ifndef MSGMNB
+#define MSGMNB 2048 /* max # of bytes in a queue */
+#endif
+#ifndef MSGMNI
+#define MSGMNI 40
+#endif
+#ifndef MSGTQL
+#define MSGTQL 40
+#endif
+
+/*
+ * Based on the configuration parameters described in an SVR2 (yes, two)
+ * config(1m) man page.
+ *
+ * Each message is broken up and stored in segments that are msgssz bytes
+ * long. For efficiency reasons, this should be a power of two. Also,
+ * it doesn't make sense if it is less than 8 or greater than about 256.
+ * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
+ * two between 8 and 1024 inclusive (and panics if it isn't).
+ */
+struct msginfo msginfo = {
+ MSGMAX, /* max chars in a message */
+ MSGMNI, /* # of message queue identifiers */
+ MSGMNB, /* max chars in a queue */
+ MSGTQL, /* max messages in system */
+ MSGSSZ, /* size of a message segment */
+ /* (must be small power of 2 greater than 4) */
+ MSGSEG /* number of message segments */
+};
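+
+/*
+ * With the defaults above the segment pool is MSGMAX = MSGSSZ * MSGSEG =
+ * 8 * 2048 = 16384 bytes, while MSGMNB, MSGMNI and MSGTQL cap the bytes per
+ * queue, the number of queues and the number of queued messages; msginit()
+ * below lets the kern.ipc.* tunables override these at boot.
+ */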
+
+/*
+ * macros to convert between msqid_ds's and msqid's.
+ * (specific to this implementation)
+ */
+#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
+#define MSQID_IX(id) ((id) & 0xffff)
+#define MSQID_SEQ(id) (((id) >> 16) & 0xffff)
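+/*
+ * Example: index 3 in a queue whose msg_perm.seq is 2 encodes to the id
+ * 0x00020003, and MSQID_IX() and MSQID_SEQ() recover the two halves.
+ */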
+
+/*
+ * The rest of this file is specific to this particular implementation.
+ */
+
+struct msgmap {
+ short next; /* next segment in buffer */
+ /* -1 -> available */
+ /* 0..(MSGSEG-1) -> index of next segment */
+};
+
+#define MSG_LOCKED 01000 /* Is this msqid_ds locked? */
+
+static int nfree_msgmaps; /* # of free map entries */
+static short free_msgmaps; /* head of linked list of free map entries */
+static struct msg *free_msghdrs;/* list of free msg headers */
+static char *msgpool; /* MSGMAX byte long msg buffer pool */
+static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
+static struct msg *msghdrs; /* MSGTQL msg headers */
+static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */
+static struct mtx msq_mtx; /* global mutex for message queues. */
+
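+/*
+ * System call slots claimed only while the module is loaded: msginit()
+ * installs these tables with syscall_helper_register() and msgunload()
+ * removes them again.
+ */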
+static struct syscall_helper_data msg_syscalls[] = {
+ SYSCALL_INIT_HELPER(msgctl),
+ SYSCALL_INIT_HELPER(msgget),
+ SYSCALL_INIT_HELPER(msgsnd),
+ SYSCALL_INIT_HELPER(msgrcv),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER(msgsys),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data msg32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_msgctl),
+ SYSCALL32_INIT_HELPER(freebsd32_msgsnd),
+ SYSCALL32_INIT_HELPER(freebsd32_msgrcv),
+ SYSCALL32_INIT_HELPER_COMPAT(msgget),
+ SYSCALL32_INIT_HELPER(freebsd32_msgsys),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+msginit()
+{
+ int i, error;
+
+ TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
+ TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
+ msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
+ TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
+ TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb);
+ TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql);
+
+ msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
+ msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
+ msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
+ msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
+ M_WAITOK);
+
+ /*
+ * msginfo.msgssz should be a power of two for efficiency reasons.
+ * It is also pretty silly if msginfo.msgssz is less than 8
+ * or greater than about 256 so ...
+ */
+
+ i = 8;
+ while (i < 1024 && i != msginfo.msgssz)
+ i <<= 1;
+ if (i != msginfo.msgssz) {
+ DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
+ msginfo.msgssz));
+ panic("msginfo.msgssz not a small power of 2");
+ }
+
+ if (msginfo.msgseg > 32767) {
+ DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
+ panic("msginfo.msgseg > 32767");
+ }
+
+ for (i = 0; i < msginfo.msgseg; i++) {
+ if (i > 0)
+ msgmaps[i-1].next = i;
+ msgmaps[i].next = -1; /* implies entry is available */
+ }
+ free_msgmaps = 0;
+ nfree_msgmaps = msginfo.msgseg;
+
+ for (i = 0; i < msginfo.msgtql; i++) {
+ msghdrs[i].msg_type = 0;
+ if (i > 0)
+ msghdrs[i-1].msg_next = &msghdrs[i];
+ msghdrs[i].msg_next = NULL;
+#ifdef MAC
+ mac_sysvmsg_init(&msghdrs[i]);
+#endif
+ }
+ free_msghdrs = &msghdrs[0];
+
+ for (i = 0; i < msginfo.msgmni; i++) {
+ msqids[i].u.msg_qbytes = 0; /* implies entry is available */
+ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */
+ msqids[i].u.msg_perm.mode = 0;
+#ifdef MAC
+ mac_sysvmsq_init(&msqids[i]);
+#endif
+ }
+ mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
+
+ error = syscall_helper_register(msg_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(msg32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+msgunload()
+{
+ struct msqid_kernel *msqkptr;
+ int msqid;
+#ifdef MAC
+ int i;
+#endif
+
+ syscall_helper_unregister(msg_syscalls);
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(msg32_syscalls);
+#endif
+
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes != 0 ||
+ (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
+ break;
+ }
+ if (msqid != msginfo.msgmni)
+ return (EBUSY);
+
+#ifdef MAC
+ for (i = 0; i < msginfo.msgtql; i++)
+ mac_sysvmsg_destroy(&msghdrs[i]);
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++)
+ mac_sysvmsq_destroy(&msqids[msqid]);
+#endif
+ free(msgpool, M_MSG);
+ free(msgmaps, M_MSG);
+ free(msghdrs, M_MSG);
+ free(msqids, M_MSG);
+ mtx_destroy(&msq_mtx);
+ return (0);
+}
+
+static int
+sysvmsg_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = msginit();
+ if (error != 0)
+ msgunload();
+ break;
+ case MOD_UNLOAD:
+ error = msgunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvmsg_mod = {
+ "sysvmsg",
+ &sysvmsg_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
+MODULE_VERSION(sysvmsg, 1);
+
+static void
+msg_freehdr(msghdr)
+ struct msg *msghdr;
+{
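+ /*
+ * Return each of the message's segments to the free-map list, then put
+ * the header itself back on the free header list.
+ */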
+ while (msghdr->msg_ts > 0) {
+ short next;
+ if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
+ panic("msghdr->msg_spot out of range");
+ next = msgmaps[msghdr->msg_spot].next;
+ msgmaps[msghdr->msg_spot].next = free_msgmaps;
+ free_msgmaps = msghdr->msg_spot;
+ nfree_msgmaps++;
+ msghdr->msg_spot = next;
+ if (msghdr->msg_ts >= msginfo.msgssz)
+ msghdr->msg_ts -= msginfo.msgssz;
+ else
+ msghdr->msg_ts = 0;
+ }
+ if (msghdr->msg_spot != -1)
+ panic("msghdr->msg_spot != -1");
+ msghdr->msg_next = free_msghdrs;
+ free_msghdrs = msghdr;
+#ifdef MAC
+ mac_sysvmsg_cleanup(msghdr);
+#endif
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds *buf;
+};
+#endif
+int
+sys_msgctl(td, uap)
+ struct thread *td;
+ register struct msgctl_args *uap;
+{
+ int msqid = uap->msqid;
+ int cmd = uap->cmd;
+ struct msqid_ds msqbuf;
+ int error;
+
+ DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
+ if (cmd == IPC_SET &&
+ (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
+ return (error);
+ error = kern_msgctl(td, msqid, cmd, &msqbuf);
+ if (cmd == IPC_STAT && error == 0)
+ error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
+ return (error);
+}
+
+int
+kern_msgctl(td, msqid, cmd, msqbuf)
+ struct thread *td;
+ int msqid;
+ int cmd;
+ struct msqid_ds *msqbuf;
+{
+ int rval, error, msqix;
+ register struct msqid_kernel *msqkptr;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ return (EINVAL);
+ }
+
+ msqkptr = &msqids[msqix];
+
+ mtx_lock(&msq_mtx);
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such msqid\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+
+ error = 0;
+ rval = 0;
+
+ switch (cmd) {
+
+ case IPC_RMID:
+ {
+ struct msg *msghdr;
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
+ goto done2;
+
+#ifdef MAC
+ /*
+ * Check that the thread has MAC access permissions to
+ * individual msghdrs. Note: We need to do this in a
+ * separate loop because the actual loop alters the
+ * msq/msghdr info as it progresses, and there is no going
+ * back if, halfway through, we discover that the
+ * thread cannot free a certain msghdr. The msq will get
+ * into an inconsistent state.
+ */
+ for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
+ msghdr = msghdr->msg_next) {
+ error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr);
+ if (error != 0)
+ goto done2;
+ }
+#endif
+
+ racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
+ crfree(msqkptr->cred);
+ msqkptr->cred = NULL;
+
+ /* Free the message headers */
+ msghdr = msqkptr->u.msg_first;
+ while (msghdr != NULL) {
+ struct msg *msghdr_tmp;
+
+ /* Free the segments of each message */
+ msqkptr->u.msg_cbytes -= msghdr->msg_ts;
+ msqkptr->u.msg_qnum--;
+ msghdr_tmp = msghdr;
+ msghdr = msghdr->msg_next;
+ msg_freehdr(msghdr_tmp);
+ }
+
+ if (msqkptr->u.msg_cbytes != 0)
+ panic("msg_cbytes is screwed up");
+ if (msqkptr->u.msg_qnum != 0)
+ panic("msg_qnum is screwed up");
+
+ msqkptr->u.msg_qbytes = 0; /* Mark it as free */
+
+#ifdef MAC
+ mac_sysvmsq_cleanup(msqkptr);
+#endif
+
+ wakeup(msqkptr);
+ }
+
+ break;
+
+ case IPC_SET:
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
+ goto done2;
+ if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
+ error = priv_check(td, PRIV_IPC_MSGSIZE);
+ if (error)
+ goto done2;
+ }
+ if (msqbuf->msg_qbytes > msginfo.msgmnb) {
+ DPRINTF(("can't increase msg_qbytes beyond %d "
+ "(truncating)\n", msginfo.msgmnb));
+ msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
+ }
+ if (msqbuf->msg_qbytes == 0) {
+ DPRINTF(("can't reduce msg_qbytes to 0\n"));
+ error = EINVAL; /* non-standard errno! */
+ goto done2;
+ }
+ msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */
+ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */
+ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
+ (msqbuf->msg_perm.mode & 0777);
+ msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
+ msqkptr->u.msg_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
+ DPRINTF(("requester doesn't have read access\n"));
+ goto done2;
+ }
+ *msqbuf = msqkptr->u;
+ break;
+
+ default:
+ DPRINTF(("invalid command %d\n", cmd));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgget_args {
+ key_t key;
+ int msgflg;
+};
+#endif
+
+int
+sys_msgget(td, uap)
+ struct thread *td;
+ register struct msgget_args *uap;
+{
+ int msqid, error = 0;
+ int key = uap->key;
+ int msgflg = uap->msgflg;
+ struct ucred *cred = td->td_ucred;
+ register struct msqid_kernel *msqkptr = NULL;
+
+ DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&msq_mtx);
+ if (key != IPC_PRIVATE) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes != 0 &&
+ msqkptr->u.msg_perm.key == key)
+ break;
+ }
+ if (msqid < msginfo.msgmni) {
+ DPRINTF(("found public key\n"));
+ if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
+ DPRINTF(("not exclusive\n"));
+ error = EEXIST;
+ goto done2;
+ }
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm,
+ msgflg & 0700))) {
+ DPRINTF(("requester doesn't have 0%o access\n",
+ msgflg & 0700));
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msqget(cred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+ goto found;
+ }
+ }
+
+ DPRINTF(("need to allocate the msqid_ds\n"));
+ if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes == 0 &&
+ (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
+ break;
+ }
+ if (msqid == msginfo.msgmni) {
+ DPRINTF(("no more msqid_ds's available\n"));
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = ENOSPC;
+ goto done2;
+ }
+#endif
+ DPRINTF(("msqid %d is available\n", msqid));
+ msqkptr->u.msg_perm.key = key;
+ msqkptr->u.msg_perm.cuid = cred->cr_uid;
+ msqkptr->u.msg_perm.uid = cred->cr_uid;
+ msqkptr->u.msg_perm.cgid = cred->cr_gid;
+ msqkptr->u.msg_perm.gid = cred->cr_gid;
+ msqkptr->u.msg_perm.mode = (msgflg & 0777);
+ msqkptr->cred = crhold(cred);
+ /* Make sure that the returned msqid is unique */
+ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
+ msqkptr->u.msg_first = NULL;
+ msqkptr->u.msg_last = NULL;
+ msqkptr->u.msg_cbytes = 0;
+ msqkptr->u.msg_qnum = 0;
+ msqkptr->u.msg_qbytes = msginfo.msgmnb;
+ msqkptr->u.msg_lspid = 0;
+ msqkptr->u.msg_lrpid = 0;
+ msqkptr->u.msg_stime = 0;
+ msqkptr->u.msg_rtime = 0;
+ msqkptr->u.msg_ctime = time_second;
+#ifdef MAC
+ mac_sysvmsq_create(cred, msqkptr);
+#endif
+ } else {
+ DPRINTF(("didn't find it and wasn't asked to create it\n"));
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ /* Construct the unique msqid */
+ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgsnd_args {
+ int msqid;
+ const void *msgp;
+ size_t msgsz;
+ int msgflg;
+};
+#endif
+int
+kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
+ struct thread *td;
+ int msqid;
+ const void *msgp; /* XXX msgp is actually mtext. */
+ size_t msgsz;
+ int msgflg;
+ long mtype;
+{
+ int msqix, segs_needed, error = 0;
+ register struct msqid_kernel *msqkptr;
+ register struct msg *msghdr;
+ short next;
+#ifdef RACCT
+ size_t saved_msgsz;
+#endif
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&msq_mtx);
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ error = EINVAL;
+ goto done2;
+ }
+
+ msqkptr = &msqids[msqix];
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such message queue id\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
+ DPRINTF(("requester doesn't have write access\n"));
+ goto done2;
+ }
+
+#ifdef MAC
+ error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
+ PROC_UNLOCK(td->td_proc);
+ error = EAGAIN;
+ goto done2;
+ }
+ saved_msgsz = msgsz;
+ if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) {
+ racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
+ PROC_UNLOCK(td->td_proc);
+ error = EAGAIN;
+ goto done2;
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
+ DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
+ msginfo.msgssz, segs_needed));
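+ /*
+ * segs_needed rounds the size up to whole segments: e.g. a 100 byte
+ * message with the default msgssz of 8 occupies 13 segments.
+ */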
+ for (;;) {
+ int need_more_resources = 0;
+
+ /*
+ * check msgsz
+ * (inside this loop in case msg_qbytes changes while we sleep)
+ */
+
+ if (msgsz > msqkptr->u.msg_qbytes) {
+ DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
+ error = EINVAL;
+ goto done3;
+ }
+
+ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
+ DPRINTF(("msqid is locked\n"));
+ need_more_resources = 1;
+ }
+ if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
+ DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
+ need_more_resources = 1;
+ }
+ if (segs_needed > nfree_msgmaps) {
+ DPRINTF(("segs_needed > nfree_msgmaps\n"));
+ need_more_resources = 1;
+ }
+ if (free_msghdrs == NULL) {
+ DPRINTF(("no more msghdrs\n"));
+ need_more_resources = 1;
+ }
+
+ if (need_more_resources) {
+ int we_own_it;
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+ DPRINTF(("need more resources but caller "
+ "doesn't want to wait\n"));
+ error = EAGAIN;
+ goto done3;
+ }
+
+ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
+ DPRINTF(("we don't own the msqid_ds\n"));
+ we_own_it = 0;
+ } else {
+ /*
+ * Force later arrivals to wait for our request.
+ */
+ DPRINTF(("we own the msqid_ds\n"));
+ msqkptr->u.msg_perm.mode |= MSG_LOCKED;
+ we_own_it = 1;
+ }
+ DPRINTF(("msgsnd: goodnight\n"));
+ error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ "msgsnd", hz);
+ DPRINTF(("msgsnd: good morning, error=%d\n", error));
+ if (we_own_it)
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ if (error == EWOULDBLOCK) {
+ DPRINTF(("msgsnd: timed out\n"));
+ continue;
+ }
+ if (error != 0) {
+ DPRINTF(("msgsnd: interrupted system call\n"));
+ error = EINTR;
+ goto done3;
+ }
+
+ /*
+			 * Make sure that the message queue still exists
+ */
+
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("msqid deleted\n"));
+ error = EIDRM;
+ goto done3;
+ }
+
+ } else {
+ DPRINTF(("got all the resources that we need\n"));
+ break;
+ }
+ }
+
+ /*
+ * We have the resources that we need.
+ * Make sure!
+ */
+
+ if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
+ panic("msg_perm.mode & MSG_LOCKED");
+ if (segs_needed > nfree_msgmaps)
+ panic("segs_needed > nfree_msgmaps");
+ if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
+ panic("msgsz + msg_cbytes > msg_qbytes");
+ if (free_msghdrs == NULL)
+ panic("no more msghdrs");
+
+ /*
+ * Re-lock the msqid_ds in case we page-fault when copying in the
+ * message
+ */
+
+ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
+ panic("msqid_ds is already locked");
+ msqkptr->u.msg_perm.mode |= MSG_LOCKED;
+
+ /*
+ * Allocate a message header
+ */
+
+ msghdr = free_msghdrs;
+ free_msghdrs = msghdr->msg_next;
+ msghdr->msg_spot = -1;
+ msghdr->msg_ts = msgsz;
+ msghdr->msg_type = mtype;
+#ifdef MAC
+ /*
+ * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here
+ * immediately? Or, should it be checked just before the msg is
+ * enqueued in the msgq (as it is done now)?
+ */
+ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr);
+#endif
+
+ /*
+ * Allocate space for the message
+ */
+
+ while (segs_needed > 0) {
+ if (nfree_msgmaps <= 0)
+ panic("not enough msgmaps");
+ if (free_msgmaps == -1)
+ panic("nil free_msgmaps");
+ next = free_msgmaps;
+ if (next <= -1)
+ panic("next too low #1");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #1");
+ DPRINTF(("allocating segment %d to message\n", next));
+ free_msgmaps = msgmaps[next].next;
+ nfree_msgmaps--;
+ msgmaps[next].next = msghdr->msg_spot;
+ msghdr->msg_spot = next;
+ segs_needed--;
+ }
+
+ /*
+ * Validate the message type
+ */
+
+ if (msghdr->msg_type < 1) {
+ msg_freehdr(msghdr);
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ wakeup(msqkptr);
+ DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
+ error = EINVAL;
+ goto done3;
+ }
+
+ /*
+ * Copy in the message body
+ */
+
+ next = msghdr->msg_spot;
+ while (msgsz > 0) {
+ size_t tlen;
+ if (msgsz > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz;
+ if (next <= -1)
+ panic("next too low #2");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #2");
+ mtx_unlock(&msq_mtx);
+ if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
+ tlen)) != 0) {
+ mtx_lock(&msq_mtx);
+ DPRINTF(("error %d copying in message segment\n",
+ error));
+ msg_freehdr(msghdr);
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ wakeup(msqkptr);
+ goto done3;
+ }
+ mtx_lock(&msq_mtx);
+ msgsz -= tlen;
+ msgp = (const char *)msgp + tlen;
+ next = msgmaps[next].next;
+ }
+ if (next != -1)
+ panic("didn't use all the msg segments");
+
+ /*
+ * We've got the message. Unlock the msqid_ds.
+ */
+
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+
+ /*
+ * Make sure that the msqid_ds is still allocated.
+ */
+
+ if (msqkptr->u.msg_qbytes == 0) {
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ error = EIDRM;
+ goto done3;
+ }
+
+#ifdef MAC
+ /*
+ * Note: Since the task/thread allocates the msghdr and usually
+ * primes it with its own MAC label, for a majority of policies, it
+ * won't be necessary to check whether the msghdr has access
+ * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would
+ * suffice in that case. However, this hook may be required where
+ * individual policies derive a non-identical label for the msghdr
+ * from the current thread label and may want to check the msghdr
+ * enqueue permissions, along with read/write permissions to the
+ * msgq.
+ */
+ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr);
+ if (error != 0) {
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ goto done3;
+ }
+#endif
+
+ /*
+ * Put the message into the queue
+ */
+ if (msqkptr->u.msg_first == NULL) {
+ msqkptr->u.msg_first = msghdr;
+ msqkptr->u.msg_last = msghdr;
+ } else {
+ msqkptr->u.msg_last->msg_next = msghdr;
+ msqkptr->u.msg_last = msghdr;
+ }
+ msqkptr->u.msg_last->msg_next = NULL;
+
+ msqkptr->u.msg_cbytes += msghdr->msg_ts;
+ msqkptr->u.msg_qnum++;
+ msqkptr->u.msg_lspid = td->td_proc->p_pid;
+ msqkptr->u.msg_stime = time_second;
+
+ wakeup(msqkptr);
+ td->td_retval[0] = 0;
+done3:
+#ifdef RACCT
+ if (error != 0) {
+ PROC_LOCK(td->td_proc);
+ racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
+ racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
+ PROC_UNLOCK(td->td_proc);
+ }
+#endif
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+int
+sys_msgsnd(struct thread *td, struct msgsnd_args *uap)
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
+ uap->msgsz, uap->msgflg));
+
+ if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+ }
+ return (kern_msgsnd(td, uap->msqid,
+ (const char *)uap->msgp + sizeof(mtype),
+ uap->msgsz, uap->msgflg, mtype));
+}
+
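+/*
+ * Usage sketch from userland (illustrative only; assumes a queue id from
+ * msgget() plus <sys/msg.h> and <err.h>).  The caller's buffer starts with
+ * the long mtype followed by the message text, which is why sys_msgsnd()
+ * peels off mtype before handing the text to kern_msgsnd():
+ *
+ *	struct mymsg {
+ *		long mtype;
+ *		char mtext[64];
+ *	} m = { 1, "ping" };
+ *
+ *	if (msgsnd(id, &m, sizeof(m.mtext), IPC_NOWAIT) == -1)
+ *		warn("msgsnd");
+ */
+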
+#ifndef _SYS_SYSPROTO_H_
+struct msgrcv_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+};
+#endif
+int
+kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz,
+    long msgtyp, int msgflg, long *mtype)
+{
+	/* XXX msgp is actually mtext. */
+	size_t len;
+	struct msqid_kernel *msqkptr;
+	struct msg *msghdr;
+	int msqix, error = 0;
+	short next;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ return (EINVAL);
+ }
+
+ msqkptr = &msqids[msqix];
+ mtx_lock(&msq_mtx);
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such message queue id\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
+ DPRINTF(("requester doesn't have read access\n"));
+ goto done2;
+ }
+
+#ifdef MAC
+ error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+
+ msghdr = NULL;
+ while (msghdr == NULL) {
+ if (msgtyp == 0) {
+ msghdr = msqkptr->u.msg_first;
+ if (msghdr != NULL) {
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+ DPRINTF(("first message on the queue "
+ "is too big (want %zu, got %d)\n",
+ msgsz, msghdr->msg_ts));
+ error = E2BIG;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msgrcv(td->td_ucred,
+ msghdr);
+ if (error != 0)
+ goto done2;
+#endif
+ if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
+ msqkptr->u.msg_first = NULL;
+ msqkptr->u.msg_last = NULL;
+ } else {
+ msqkptr->u.msg_first = msghdr->msg_next;
+ if (msqkptr->u.msg_first == NULL)
+ panic("msg_first/last screwed up #1");
+ }
+ }
+ } else {
+ struct msg *previous;
+ struct msg **prev;
+
+ previous = NULL;
+ prev = &(msqkptr->u.msg_first);
+ while ((msghdr = *prev) != NULL) {
+ /*
+ * Is this message's type an exact match or is
+ * this message's type less than or equal to
+ * the absolute value of a negative msgtyp?
+ * Note that the second half of this test can
+ * NEVER be true if msgtyp is positive since
+ * msg_type is always positive!
+ */
+
+ if (msgtyp == msghdr->msg_type ||
+ msghdr->msg_type <= -msgtyp) {
+ DPRINTF(("found message type %ld, "
+ "requested %ld\n",
+ msghdr->msg_type, msgtyp));
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+ DPRINTF(("requested message "
+ "on the queue is too big "
+ "(want %zu, got %hu)\n",
+ msgsz, msghdr->msg_ts));
+ error = E2BIG;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msgrcv(
+ td->td_ucred, msghdr);
+ if (error != 0)
+ goto done2;
+#endif
+ *prev = msghdr->msg_next;
+ if (msghdr == msqkptr->u.msg_last) {
+ if (previous == NULL) {
+ if (prev !=
+ &msqkptr->u.msg_first)
+ panic("msg_first/last screwed up #2");
+ msqkptr->u.msg_first =
+ NULL;
+ msqkptr->u.msg_last =
+ NULL;
+ } else {
+ if (prev ==
+ &msqkptr->u.msg_first)
+ panic("msg_first/last screwed up #3");
+ msqkptr->u.msg_last =
+ previous;
+ }
+ }
+ break;
+ }
+ previous = msghdr;
+ prev = &(msghdr->msg_next);
+ }
+ }
+
+ /*
+ * We've either extracted the msghdr for the appropriate
+ * message or there isn't one.
+ * If there is one then bail out of this loop.
+ */
+
+ if (msghdr != NULL)
+ break;
+
+ /*
+ * Hmph! No message found. Does the user want to wait?
+ */
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+ DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
+ msgtyp));
+ /* The SVID says to return ENOMSG. */
+ error = ENOMSG;
+ goto done2;
+ }
+
+ /*
+ * Wait for something to happen
+ */
+
+ DPRINTF(("msgrcv: goodnight\n"));
+ error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ "msgrcv", 0);
+ DPRINTF(("msgrcv: good morning (error=%d)\n", error));
+
+ if (error != 0) {
+ DPRINTF(("msgrcv: interrupted system call\n"));
+ error = EINTR;
+ goto done2;
+ }
+
+ /*
+		 * Make sure that the message queue still exists
+ */
+
+ if (msqkptr->u.msg_qbytes == 0 ||
+ msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("msqid deleted\n"));
+ error = EIDRM;
+ goto done2;
+ }
+ }
+
+ /*
+ * Return the message to the user.
+ *
+ * First, do the bookkeeping (before we risk being interrupted).
+ */
+
+ msqkptr->u.msg_cbytes -= msghdr->msg_ts;
+ msqkptr->u.msg_qnum--;
+ msqkptr->u.msg_lrpid = td->td_proc->p_pid;
+ msqkptr->u.msg_rtime = time_second;
+
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts);
+
+ /*
+ * Make msgsz the actual amount that we'll be returning.
+ * Note that this effectively truncates the message if it is too long
+ * (since msgsz is never increased).
+ */
+
+ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
+ msghdr->msg_ts));
+ if (msgsz > msghdr->msg_ts)
+ msgsz = msghdr->msg_ts;
+ *mtype = msghdr->msg_type;
+
+ /*
+ * Return the segments to the user
+ */
+
+ next = msghdr->msg_spot;
+ for (len = 0; len < msgsz; len += msginfo.msgssz) {
+ size_t tlen;
+
+ if (msgsz - len > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz - len;
+ if (next <= -1)
+ panic("next too low #3");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #3");
+ mtx_unlock(&msq_mtx);
+ error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
+ mtx_lock(&msq_mtx);
+ if (error != 0) {
+ DPRINTF(("error (%d) copying out message segment\n",
+ error));
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ goto done2;
+ }
+ msgp = (char *)msgp + tlen;
+ next = msgmaps[next].next;
+ }
+
+ /*
+ * Done, return the actual number of bytes copied out.
+ */
+
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ td->td_retval[0] = msgsz;
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+int
+sys_msgrcv(struct thread *td, struct msgrcv_args *uap)
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
+ uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
+
+ if ((error = kern_msgrcv(td, uap->msqid,
+ (char *)uap->msgp + sizeof(mtype), uap->msgsz,
+ uap->msgtyp, uap->msgflg, &mtype)) != 0)
+ return (error);
+ if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+}
+
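+/*
+ * Usage sketch from userland (illustrative only; same mymsg layout and
+ * queue id as in the msgsnd() sketch above).  sys_msgrcv() copies the
+ * message type back to the front of the caller's buffer:
+ *
+ *	struct mymsg m;
+ *	ssize_t n = msgrcv(id, &m, sizeof(m.mtext), 0, 0);
+ *
+ *	if (n == -1)
+ *		warn("msgrcv");
+ */
+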
+static int
+sysctl_msqids(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, msqids,
+ sizeof(struct msqid_kernel) * msginfo.msgmni));
+}
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
+ "Maximum message size");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
+ "Number of message queue identifiers");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
+ "Maximum number of bytes in a queue");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
+ "Maximum number of messages in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
+ "Size of a message segment");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
+ "Number of message segments");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_msqids, "", "Message queue IDs");
+
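+/*
+ * The limits above can be inspected from userland with sysctl(3).
+ * A minimal sketch (illustrative only; assumes <sys/types.h>,
+ * <sys/sysctl.h> and <stdio.h>):
+ *
+ *	int msgmnb;
+ *	size_t len = sizeof(msgmnb);
+ *
+ *	if (sysctlbyname("kern.ipc.msgmnb", &msgmnb, &len, NULL, 0) == 0)
+ *		printf("max bytes per queue: %d\n", msgmnb);
+ */
+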
+#ifdef COMPAT_FREEBSD32
+int
+freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0:
+ return (freebsd7_freebsd32_msgctl(td,
+ (struct freebsd7_freebsd32_msgctl_args *)&uap->a2));
+ case 2:
+ return (freebsd32_msgsnd(td,
+ (struct freebsd32_msgsnd_args *)&uap->a2));
+ case 3:
+ return (freebsd32_msgrcv(td,
+ (struct freebsd32_msgrcv_args *)&uap->a2));
+ default:
+ return (sys_msgsys(td, (struct msgsys_args *)uap));
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_msgctl(struct thread *td,
+ struct freebsd7_freebsd32_msgctl_args *uap)
+{
+ struct msqid_ds msqbuf;
+ struct msqid_ds32_old msqbuf32;
+ int error;
+
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
+ PTRIN_CP(msqbuf32, msqbuf, msg_first);
+ PTRIN_CP(msqbuf32, msqbuf, msg_last);
+ CP(msqbuf32, msqbuf, msg_cbytes);
+ CP(msqbuf32, msqbuf, msg_qnum);
+ CP(msqbuf32, msqbuf, msg_qbytes);
+ CP(msqbuf32, msqbuf, msg_lspid);
+ CP(msqbuf32, msqbuf, msg_lrpid);
+ CP(msqbuf32, msqbuf, msg_stime);
+ CP(msqbuf32, msqbuf, msg_rtime);
+ CP(msqbuf32, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ bzero(&msqbuf32, sizeof(msqbuf32));
+ freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
+ PTROUT_CP(msqbuf, msqbuf32, msg_first);
+ PTROUT_CP(msqbuf, msqbuf32, msg_last);
+ CP(msqbuf, msqbuf32, msg_cbytes);
+ CP(msqbuf, msqbuf32, msg_qnum);
+ CP(msqbuf, msqbuf32, msg_qbytes);
+ CP(msqbuf, msqbuf32, msg_lspid);
+ CP(msqbuf, msqbuf32, msg_lrpid);
+ CP(msqbuf, msqbuf32, msg_stime);
+ CP(msqbuf, msqbuf32, msg_rtime);
+ CP(msqbuf, msqbuf32, msg_ctime);
+ error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
+ }
+ return (error);
+}
+#endif
+
+int
+freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap)
+{
+ struct msqid_ds msqbuf;
+ struct msqid_ds32 msqbuf32;
+ int error;
+
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
+ PTRIN_CP(msqbuf32, msqbuf, msg_first);
+ PTRIN_CP(msqbuf32, msqbuf, msg_last);
+ CP(msqbuf32, msqbuf, msg_cbytes);
+ CP(msqbuf32, msqbuf, msg_qnum);
+ CP(msqbuf32, msqbuf, msg_qbytes);
+ CP(msqbuf32, msqbuf, msg_lspid);
+ CP(msqbuf32, msqbuf, msg_lrpid);
+ CP(msqbuf32, msqbuf, msg_stime);
+ CP(msqbuf32, msqbuf, msg_rtime);
+ CP(msqbuf32, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
+ PTROUT_CP(msqbuf, msqbuf32, msg_first);
+ PTROUT_CP(msqbuf, msqbuf32, msg_last);
+ CP(msqbuf, msqbuf32, msg_cbytes);
+ CP(msqbuf, msqbuf32, msg_qnum);
+ CP(msqbuf, msqbuf32, msg_qbytes);
+ CP(msqbuf, msqbuf32, msg_lspid);
+ CP(msqbuf, msqbuf32, msg_lrpid);
+ CP(msqbuf, msqbuf32, msg_stime);
+ CP(msqbuf, msqbuf32, msg_rtime);
+ CP(msqbuf, msqbuf32, msg_ctime);
+ error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
+ }
+ return (error);
+}
+
+int
+freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap)
+{
+ const void *msgp;
+ long mtype;
+ int32_t mtype32;
+ int error;
+
+ msgp = PTRIN(uap->msgp);
+ if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0)
+ return (error);
+ mtype = mtype32;
+ return (kern_msgsnd(td, uap->msqid,
+ (const char *)msgp + sizeof(mtype32),
+ uap->msgsz, uap->msgflg, mtype));
+}
+
+int
+freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap)
+{
+ void *msgp;
+ long mtype;
+ int32_t mtype32;
+ int error;
+
+ msgp = PTRIN(uap->msgp);
+ if ((error = kern_msgrcv(td, uap->msqid,
+ (char *)msgp + sizeof(mtype32), uap->msgsz,
+ uap->msgtyp, uap->msgflg, &mtype)) != 0)
+ return (error);
+ mtype32 = (int32_t)mtype;
+ return (copyout(&mtype32, msgp, sizeof(mtype32)));
+}
+#endif
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *msgcalls[] = {
+ (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget,
+ (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv
+};
+
+/*
+ * Entry point for all MSG calls.
+ */
+int
+sys_msgsys(struct thread *td, struct msgsys_args *uap)
+{
+	/*
+	 * XXX actually varargs; the arguments are
+	 * { int which, a2, a3, a4, a5, a6; }.
+	 */
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
+ return (EINVAL);
+ error = (*msgcalls[uap->which])(td, &uap->a2);
+ return (error);
+}
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7_msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds_old *buf;
+};
+#endif
+int
+freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap)
+{
+ struct msqid_ds_old msqold;
+ struct msqid_ds msqbuf;
+ int error;
+
+ DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd,
+ uap->buf));
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqold, sizeof(msqold));
+ if (error)
+ return (error);
+ ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm);
+ CP(msqold, msqbuf, msg_first);
+ CP(msqold, msqbuf, msg_last);
+ CP(msqold, msqbuf, msg_cbytes);
+ CP(msqold, msqbuf, msg_qnum);
+ CP(msqold, msqbuf, msg_qbytes);
+ CP(msqold, msqbuf, msg_lspid);
+ CP(msqold, msqbuf, msg_lrpid);
+ CP(msqold, msqbuf, msg_stime);
+ CP(msqold, msqbuf, msg_rtime);
+ CP(msqold, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ bzero(&msqold, sizeof(msqold));
+ ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm);
+ CP(msqbuf, msqold, msg_first);
+ CP(msqbuf, msqold, msg_last);
+ CP(msqbuf, msqold, msg_cbytes);
+ CP(msqbuf, msqold, msg_qnum);
+ CP(msqbuf, msqold, msg_qbytes);
+ CP(msqbuf, msqold, msg_lspid);
+ CP(msqbuf, msqold, msg_lrpid);
+ CP(msqbuf, msqold, msg_stime);
+ CP(msqbuf, msqold, msg_rtime);
+ CP(msqbuf, msqold, msg_ctime);
+ error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old));
+ }
+ return (error);
+}
+
+#undef CP
+
+#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
+ COMPAT_FREEBSD7 */
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
new file mode 100644
index 0000000..f9ff217
--- /dev/null
+++ b/sys/kern/sysv_sem.c
@@ -0,0 +1,1666 @@
+/*-
+ * Implementation of SVID semaphores
+ *
+ * Author: Daniel Boulet
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/sem.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(sysv_sem, "System V semaphores support");
+
+static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
+
+#ifdef SEM_DEBUG
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a)
+#endif
+
+static int seminit(void);
+static int sysvsem_modload(struct module *, int, void *);
+static int semunload(void);
+static void semexit_myhook(void *arg, struct proc *p);
+static int sysctl_sema(SYSCTL_HANDLER_ARGS);
+static int semvalid(int semid, struct semid_kernel *semakptr);
+
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args;
+int __semctl(struct thread *td, struct __semctl_args *uap);
+struct semget_args;
+int semget(struct thread *td, struct semget_args *uap);
+struct semop_args;
+int semop(struct thread *td, struct semop_args *uap);
+#endif
+
+static struct sem_undo *semu_alloc(struct thread *td);
+static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
+ int semid, int semseq, int semnum, int adjval);
+static void semundo_clear(int semid, int semnum);
+
+static struct mtx sem_mtx; /* semaphore global lock */
+static struct mtx sem_undo_mtx;
+static int semtot = 0;
+static struct semid_kernel *sema; /* semaphore id pool */
+static struct mtx *sema_mtx; /* semaphore id pool mutexes*/
+static struct sem *sem; /* semaphore pool */
+LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */
+LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */
+static int *semu; /* undo structure pool */
+static eventhandler_tag semexit_tag;
+
+#define SEMUNDO_MTX sem_undo_mtx
+#define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX);
+#define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX);
+#define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how));
+
+struct sem {
+ u_short semval; /* semaphore value */
+ pid_t sempid; /* pid of last operation */
+ u_short semncnt; /* # awaiting semval > cval */
+ u_short semzcnt; /* # awaiting semval = 0 */
+};
+
+/*
+ * Undo structure (one per process)
+ */
+struct sem_undo {
+ LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */
+ struct proc *un_proc; /* owner of this structure */
+ short un_cnt; /* # of active entries */
+ struct undo {
+ short un_adjval; /* adjust on exit values */
+ short un_num; /* semaphore # */
+ int un_id; /* semid */
+ unsigned short un_seq;
+ } un_ent[1]; /* undo entries */
+};
+
+/*
+ * Configuration parameters
+ */
+#ifndef SEMMNI
+#define SEMMNI 50 /* # of semaphore identifiers */
+#endif
+#ifndef SEMMNS
+#define SEMMNS 340 /* # of semaphores in system */
+#endif
+#ifndef SEMUME
+#define SEMUME 50 /* max # of undo entries per process */
+#endif
+#ifndef SEMMNU
+#define SEMMNU 150 /* # of undo structures in system */
+#endif
+
+/* shouldn't need tuning */
+#ifndef SEMMSL
+#define SEMMSL SEMMNS /* max # of semaphores per id */
+#endif
+#ifndef SEMOPM
+#define SEMOPM 100 /* max # of operations per semop call */
+#endif
+
+#define SEMVMX 32767 /* semaphore maximum value */
+#define SEMAEM 16384 /* adjust on exit max value */
+
+/*
+ * Due to the way semaphore memory is allocated, we have to ensure that
+ * SEMUSZ is properly aligned.
+ */
+
+#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
+
+/* actual size of an undo structure */
+#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
+
+/*
+ * Macro to find a particular sem_undo vector
+ */
+#define SEMU(ix) \
+ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
+
+/*
+ * semaphore info struct
+ */
+struct seminfo seminfo = {
+ SEMMNI, /* # of semaphore identifiers */
+ SEMMNS, /* # of semaphores in system */
+ SEMMNU, /* # of undo structures in system */
+ SEMMSL, /* max # of semaphores per id */
+ SEMOPM, /* max # of operations per semop call */
+ SEMUME, /* max # of undo entries per process */
+ SEMUSZ, /* size in bytes of undo structure */
+ SEMVMX, /* semaphore maximum value */
+ SEMAEM /* adjust on exit max value */
+};
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
+ "Number of semaphore identifiers");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
+ "Maximum number of semaphores in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
+ "Maximum number of undo structures in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0,
+ "Max semaphores per id");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
+ "Max operations per semop call");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
+ "Max undo entries per process");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
+ "Size in bytes of undo structure");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0,
+ "Semaphore maximum value");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0,
+ "Adjust on exit max value");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_sema, "", "Semaphore id pool");
+
+static struct syscall_helper_data sem_syscalls[] = {
+ SYSCALL_INIT_HELPER(__semctl),
+ SYSCALL_INIT_HELPER(semget),
+ SYSCALL_INIT_HELPER(semop),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER(semsys),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data sem32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_semctl),
+ SYSCALL32_INIT_HELPER_COMPAT(semget),
+ SYSCALL32_INIT_HELPER_COMPAT(semop),
+ SYSCALL32_INIT_HELPER(freebsd32_semsys),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+seminit(void)
+{
+ int i, error;
+
+ TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
+ TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
+ TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
+ TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
+ TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
+ TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
+ TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
+ TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
+ TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
+
+ sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
+ sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
+ M_WAITOK);
+ sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
+ M_WAITOK | M_ZERO);
+ semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
+
+ for (i = 0; i < seminfo.semmni; i++) {
+ sema[i].u.sem_base = 0;
+ sema[i].u.sem_perm.mode = 0;
+ sema[i].u.sem_perm.seq = 0;
+#ifdef MAC
+ mac_sysvsem_init(&sema[i]);
+#endif
+ }
+ for (i = 0; i < seminfo.semmni; i++)
+ mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
+ LIST_INIT(&semu_free_list);
+ for (i = 0; i < seminfo.semmnu; i++) {
+ struct sem_undo *suptr = SEMU(i);
+ suptr->un_proc = NULL;
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ }
+ LIST_INIT(&semu_list);
+ mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
+ mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF);
+ semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
+ EVENTHANDLER_PRI_ANY);
+
+ error = syscall_helper_register(sem_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(sem32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+semunload(void)
+{
+ int i;
+
+ /* XXXKIB */
+ if (semtot != 0)
+ return (EBUSY);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(sem32_syscalls);
+#endif
+ syscall_helper_unregister(sem_syscalls);
+ EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
+#ifdef MAC
+ for (i = 0; i < seminfo.semmni; i++)
+ mac_sysvsem_destroy(&sema[i]);
+#endif
+ free(sem, M_SEM);
+ free(sema, M_SEM);
+ free(semu, M_SEM);
+ for (i = 0; i < seminfo.semmni; i++)
+ mtx_destroy(&sema_mtx[i]);
+ free(sema_mtx, M_SEM);
+ mtx_destroy(&sem_mtx);
+ mtx_destroy(&sem_undo_mtx);
+ return (0);
+}
+
+static int
+sysvsem_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = seminit();
+ if (error != 0)
+ semunload();
+ break;
+ case MOD_UNLOAD:
+ error = semunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvsem_mod = {
+ "sysvsem",
+ &sysvsem_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvsem, 1);
+
+/*
+ * Allocate a new sem_undo structure for a process
+ * (returns ptr to structure or NULL if no more room)
+ */
+
+static struct sem_undo *
+semu_alloc(struct thread *td)
+{
+ struct sem_undo *suptr;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+ if ((suptr = LIST_FIRST(&semu_free_list)) == NULL)
+ return (NULL);
+ LIST_REMOVE(suptr, un_next);
+ LIST_INSERT_HEAD(&semu_list, suptr, un_next);
+ suptr->un_cnt = 0;
+ suptr->un_proc = td->td_proc;
+ return (suptr);
+}
+
+static int
+semu_try_free(struct sem_undo *suptr)
+{
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+
+ if (suptr->un_cnt != 0)
+ return (0);
+ LIST_REMOVE(suptr, un_next);
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ return (1);
+}
+
+/*
+ * Adjust a particular entry for a particular proc
+ */
+
+static int
+semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid,
+ int semseq, int semnum, int adjval)
+{
+ struct proc *p = td->td_proc;
+ struct sem_undo *suptr;
+ struct undo *sunptr;
+ int i;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+	/*
+	 * Look for and remember the sem_undo if the caller doesn't
+	 * provide it.
+	 */
+
+ suptr = *supptr;
+ if (suptr == NULL) {
+ LIST_FOREACH(suptr, &semu_list, un_next) {
+ if (suptr->un_proc == p) {
+ *supptr = suptr;
+ break;
+ }
+ }
+ if (suptr == NULL) {
+ if (adjval == 0)
+ return(0);
+ suptr = semu_alloc(td);
+ if (suptr == NULL)
+ return (ENOSPC);
+ *supptr = suptr;
+ }
+ }
+
+ /*
+ * Look for the requested entry and adjust it (delete if adjval becomes
+ * 0).
+ */
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid || sunptr->un_num != semnum)
+ continue;
+ if (adjval != 0) {
+ adjval += sunptr->un_adjval;
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ }
+ sunptr->un_adjval = adjval;
+ if (sunptr->un_adjval == 0) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt)
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ if (suptr->un_cnt == 0)
+ semu_try_free(suptr);
+ }
+ return (0);
+ }
+
+ /* Didn't find the right entry - create it */
+ if (adjval == 0)
+ return (0);
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ if (suptr->un_cnt != seminfo.semume) {
+ sunptr = &suptr->un_ent[suptr->un_cnt];
+ suptr->un_cnt++;
+ sunptr->un_adjval = adjval;
+ sunptr->un_id = semid;
+ sunptr->un_num = semnum;
+ sunptr->un_seq = semseq;
+ } else
+ return (EINVAL);
+ return (0);
+}
+
+static void
+semundo_clear(int semid, int semnum)
+{
+ struct sem_undo *suptr, *suptr1;
+ struct undo *sunptr;
+ int i;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+ LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) {
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid)
+ continue;
+ if (semnum == -1 || sunptr->un_num == semnum) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt) {
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ continue;
+ }
+ semu_try_free(suptr);
+ }
+ if (semnum != -1)
+ break;
+ }
+ }
+}
+
+static int
+semvalid(int semid, struct semid_kernel *semakptr)
+{
+
+ return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
+}
+
+/*
+ * Note that the user-mode half of this passes a union, not a pointer.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun *arg;
+};
+#endif
+int
+sys___semctl(struct thread *td, struct __semctl_args *uap)
+{
+ struct semid_ds dsbuf;
+ union semun arg, semun;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
+ if (error)
+ return (error);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = arg.array;
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
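+/*
+ * Usage sketch from userland (illustrative only; assumes a set id from
+ * semget() and union semun from <sys/sem.h>): set and read back the value
+ * of semaphore 0 in the set.
+ *
+ *	union semun arg;
+ *
+ *	arg.val = 1;
+ *	if (semctl(id, 0, SETVAL, arg) == -1)
+ *		warn("semctl(SETVAL)");
+ *	printf("value: %d\n", semctl(id, 0, GETVAL));
+ */
+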
+int
+kern_semctl(struct thread *td, int semid, int semnum, int cmd,
+ union semun *arg, register_t *rval)
+{
+ u_short *array;
+ struct ucred *cred = td->td_ucred;
+ int i, error;
+ struct semid_ds *sbuf;
+ struct semid_kernel *semakptr;
+ struct mtx *sema_mtxp;
+ u_short usval, count;
+ int semidx;
+
+ DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
+ semid, semnum, cmd, arg));
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ array = NULL;
+
+ switch(cmd) {
+ case SEM_STAT:
+ /*
+ * For this command we assume semid is an array index
+ * rather than an IPC id.
+ */
+ if (semid < 0 || semid >= seminfo.semmni)
+ return (EINVAL);
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+#ifdef MAC
+ error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+ *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
+ mtx_unlock(sema_mtxp);
+ return (0);
+ }
+
+ semidx = IPCID_TO_IX(semid);
+ if (semidx < 0 || semidx >= seminfo.semmni)
+ return (EINVAL);
+
+ semakptr = &sema[semidx];
+ sema_mtxp = &sema_mtx[semidx];
+ if (cmd == IPC_RMID)
+ mtx_lock(&sem_mtx);
+ mtx_lock(sema_mtxp);
+#ifdef MAC
+ error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+
+ error = 0;
+ *rval = 0;
+
+ switch (cmd) {
+ case IPC_RMID:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
+ goto done2;
+ semakptr->u.sem_perm.cuid = cred->cr_uid;
+ semakptr->u.sem_perm.uid = cred->cr_uid;
+ semakptr->u.sem_perm.mode = 0;
+ racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
+ crfree(semakptr->cred);
+ semakptr->cred = NULL;
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, -1);
+ SEMUNDO_UNLOCK();
+#ifdef MAC
+ mac_sysvsem_cleanup(semakptr);
+#endif
+ wakeup(semakptr);
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[i].u.sem_base > semakptr->u.sem_base)
+ mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
+ }
+ for (i = semakptr->u.sem_base - sem; i < semtot; i++)
+ sem[i] = sem[i + semakptr->u.sem_nsems];
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[i].u.sem_base > semakptr->u.sem_base) {
+ sema[i].u.sem_base -= semakptr->u.sem_nsems;
+ mtx_unlock(&sema_mtx[i]);
+ }
+ }
+ semtot -= semakptr->u.sem_nsems;
+ break;
+
+ case IPC_SET:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
+ goto done2;
+ sbuf = arg->buf;
+ semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
+ semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
+ semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
+ ~0777) | (sbuf->sem_perm.mode & 0777);
+ semakptr->u.sem_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+ break;
+
+ case GETNCNT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semncnt;
+ break;
+
+ case GETPID:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].sempid;
+ break;
+
+ case GETVAL:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semval;
+ break;
+
+ case GETALL:
+ /*
+ * Unfortunately, callers of this function don't know
+ * in advance how many semaphores are in this set.
+ * While we could just allocate the maximum size array
+ * and pass the actual size back to the caller, that
+ * won't work for SETALL since we can't copyin() more
+ * data than the user specified as we may return a
+ * spurious EFAULT.
+ *
+ * Note that the number of semaphores in a set is
+ * fixed for the life of that set. The only way that
+	 * the 'count' could change while we are blocked in
+ * malloc() is if this semaphore set were destroyed
+ * and a new one created with the same index.
+ * However, semvalid() will catch that due to the
+ * sequence number unless exactly 0x8000 (or a
+ * multiple thereof) semaphore sets for the same index
+ * are created and destroyed while we are in malloc!
+ *
+ */
+ count = semakptr->u.sem_nsems;
+ mtx_unlock(sema_mtxp);
+ array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
+ mtx_lock(sema_mtxp);
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ for (i = 0; i < semakptr->u.sem_nsems; i++)
+ array[i] = semakptr->u.sem_base[i].semval;
+ mtx_unlock(sema_mtxp);
+ error = copyout(array, arg->array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
+ break;
+
+ case GETZCNT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semzcnt;
+ break;
+
+ case SETVAL:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (arg->val < 0 || arg->val > seminfo.semvmx) {
+ error = ERANGE;
+ goto done2;
+ }
+ semakptr->u.sem_base[semnum].semval = arg->val;
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, semnum);
+ SEMUNDO_UNLOCK();
+ wakeup(semakptr);
+ break;
+
+ case SETALL:
+ /*
+ * See comment on GETALL for why 'count' shouldn't change
+ * and why we require a userland buffer.
+ */
+ count = semakptr->u.sem_nsems;
+ mtx_unlock(sema_mtxp);
+ array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
+ error = copyin(arg->array, array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
+ if (error)
+ break;
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
+ goto done2;
+ for (i = 0; i < semakptr->u.sem_nsems; i++) {
+ usval = array[i];
+ if (usval > seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ }
+ semakptr->u.sem_base[i].semval = usval;
+ }
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, -1);
+ SEMUNDO_UNLOCK();
+ wakeup(semakptr);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+done2:
+ mtx_unlock(sema_mtxp);
+ if (cmd == IPC_RMID)
+ mtx_unlock(&sem_mtx);
+ if (array != NULL)
+ free(array, M_TEMP);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semget_args {
+ key_t key;
+ int nsems;
+ int semflg;
+};
+#endif
+int
+sys_semget(struct thread *td, struct semget_args *uap)
+{
+ int semid, error = 0;
+ int key = uap->key;
+ int nsems = uap->nsems;
+ int semflg = uap->semflg;
+ struct ucred *cred = td->td_ucred;
+
+ DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&sem_mtx);
+ if (key != IPC_PRIVATE) {
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[semid].u.sem_perm.key == key)
+ break;
+ }
+ if (semid < seminfo.semmni) {
+ DPRINTF(("found public key\n"));
+ if ((error = ipcperm(td, &sema[semid].u.sem_perm,
+ semflg & 0700))) {
+ goto done2;
+ }
+ if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
+ DPRINTF(("too small\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
+ DPRINTF(("not exclusive\n"));
+ error = EEXIST;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvsem_check_semget(cred, &sema[semid]);
+ if (error != 0)
+ goto done2;
+#endif
+ goto found;
+ }
+ }
+
+ DPRINTF(("need to allocate the semid_kernel\n"));
+ if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
+ if (nsems <= 0 || nsems > seminfo.semmsl) {
+ DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
+ seminfo.semmsl));
+ error = EINVAL;
+ goto done2;
+ }
+ if (nsems > seminfo.semmns - semtot) {
+ DPRINTF((
+ "not enough semaphores left (need %d, got %d)\n",
+ nsems, seminfo.semmns - semtot));
+ error = ENOSPC;
+ goto done2;
+ }
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
+ break;
+ }
+ if (semid == seminfo.semmni) {
+ DPRINTF(("no more semid_kernel's available\n"));
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_NSEM, nsems);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = ENOSPC;
+ goto done2;
+ }
+#endif
+ DPRINTF(("semid %d is available\n", semid));
+ mtx_lock(&sema_mtx[semid]);
+ KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
+ ("Lost semaphore %d", semid));
+ sema[semid].u.sem_perm.key = key;
+ sema[semid].u.sem_perm.cuid = cred->cr_uid;
+ sema[semid].u.sem_perm.uid = cred->cr_uid;
+ sema[semid].u.sem_perm.cgid = cred->cr_gid;
+ sema[semid].u.sem_perm.gid = cred->cr_gid;
+ sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
+ sema[semid].cred = crhold(cred);
+ sema[semid].u.sem_perm.seq =
+ (sema[semid].u.sem_perm.seq + 1) & 0x7fff;
+ sema[semid].u.sem_nsems = nsems;
+ sema[semid].u.sem_otime = 0;
+ sema[semid].u.sem_ctime = time_second;
+ sema[semid].u.sem_base = &sem[semtot];
+ semtot += nsems;
+ bzero(sema[semid].u.sem_base,
+ sizeof(sema[semid].u.sem_base[0])*nsems);
+#ifdef MAC
+ mac_sysvsem_create(cred, &sema[semid]);
+#endif
+ mtx_unlock(&sema_mtx[semid]);
+ DPRINTF(("sembase = %p, next = %p\n",
+ sema[semid].u.sem_base, &sem[semtot]));
+ } else {
+ DPRINTF(("didn't find it and wasn't asked to create it\n"));
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
+done2:
+ mtx_unlock(&sem_mtx);
+ return (error);
+}
+
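+/*
+ * Usage sketch from userland (illustrative only; assumes <sys/sem.h> and
+ * <err.h>): create a private set of three semaphores, taking the
+ * allocation path above.
+ *
+ *	int id = semget(IPC_PRIVATE, 3, IPC_CREAT | 0600);
+ *
+ *	if (id == -1)
+ *		err(1, "semget");
+ */
+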
+#ifndef _SYS_SYSPROTO_H_
+struct semop_args {
+ int semid;
+ struct sembuf *sops;
+ size_t nsops;
+};
+#endif
+int
+sys_semop(struct thread *td, struct semop_args *uap)
+{
+#define SMALL_SOPS 8
+ struct sembuf small_sops[SMALL_SOPS];
+ int semid = uap->semid;
+ size_t nsops = uap->nsops;
+ struct sembuf *sops;
+ struct semid_kernel *semakptr;
+ struct sembuf *sopptr = 0;
+ struct sem *semptr = 0;
+ struct sem_undo *suptr;
+ struct mtx *sema_mtxp;
+ size_t i, j, k;
+ int error;
+ int do_wakeup, do_undos;
+ unsigned short seq;
+
+#ifdef SEM_DEBUG
+ sops = NULL;
+#endif
+	DPRINTF(("call to semop(%d, %p, %zu)\n", semid, sops, nsops));
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
+
+ if (semid < 0 || semid >= seminfo.semmni)
+ return (EINVAL);
+
+ /* Allocate memory for sem_ops */
+ if (nsops <= SMALL_SOPS)
+ sops = small_sops;
+ else if (nsops > seminfo.semopm) {
+		DPRINTF(("too many sops (max=%d, nsops=%zu)\n", seminfo.semopm,
+		    nsops));
+ return (E2BIG);
+ } else {
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) {
+ PROC_UNLOCK(td->td_proc);
+ return (E2BIG);
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
+ }
+ if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
+		DPRINTF(("error = %d from copyin(%p, %p, %zu)\n", error,
+		    uap->sops, sops, nsops * sizeof(sops[0])));
+ if (sops != small_sops)
+			free(sops, M_TEMP);
+ return (error);
+ }
+
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ seq = semakptr->u.sem_perm.seq;
+ if (seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EINVAL;
+ goto done2;
+ }
+ /*
+ * Initial pass thru sops to see what permissions are needed.
+ * Also perform any checks that don't need repeating on each
+ * attempt to satisfy the request vector.
+ */
+ j = 0; /* permission needed */
+ do_undos = 0;
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ if (sopptr->sem_num >= semakptr->u.sem_nsems) {
+ error = EFBIG;
+ goto done2;
+ }
+ if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
+ do_undos = 1;
+ j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
+ }
+
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
+ DPRINTF(("error = %d from ipaccess\n", error));
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j);
+ if (error != 0)
+ goto done2;
+#endif
+
+ /*
+ * Loop trying to satisfy the vector of requests.
+ * If we reach a point where we must wait, any requests already
+ * performed are rolled back and we go to sleep until some other
+ * process wakes us up. At this point, we start all over again.
+ *
+ * This ensures that from the perspective of other tasks, a set
+ * of requests is atomic (never partially satisfied).
+ */
+ for (;;) {
+ do_wakeup = 0;
+ error = 0; /* error return if necessary */
+
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+
+ DPRINTF((
+ "semop: semakptr=%p, sem_base=%p, "
+ "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
+ semakptr, semakptr->u.sem_base, semptr,
+ sopptr->sem_num, semptr->semval, sopptr->sem_op,
+ (sopptr->sem_flg & IPC_NOWAIT) ?
+ "nowait" : "wait"));
+
+ if (sopptr->sem_op < 0) {
+ if (semptr->semval + sopptr->sem_op < 0) {
+ DPRINTF(("semop: can't do it now\n"));
+ break;
+ } else {
+ semptr->semval += sopptr->sem_op;
+ if (semptr->semval == 0 &&
+ semptr->semzcnt > 0)
+ do_wakeup = 1;
+ }
+ } else if (sopptr->sem_op == 0) {
+ if (semptr->semval != 0) {
+ DPRINTF(("semop: not zero now\n"));
+ break;
+ }
+ } else if (semptr->semval + sopptr->sem_op >
+ seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ } else {
+ if (semptr->semncnt > 0)
+ do_wakeup = 1;
+ semptr->semval += sopptr->sem_op;
+ }
+ }
+
+ /*
+ * Did we get through the entire vector?
+ */
+ if (i >= nsops)
+ goto done;
+
+ /*
+ * No ... rollback anything that we've already done
+ */
+ DPRINTF(("semop: rollback 0 through %d\n", i-1));
+ for (j = 0; j < i; j++)
+ semakptr->u.sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ /* If we detected an error, return it */
+ if (error != 0)
+ goto done2;
+
+ /*
+ * If the request that we couldn't satisfy has the
+ * NOWAIT flag set then return with EAGAIN.
+ */
+ if (sopptr->sem_flg & IPC_NOWAIT) {
+ error = EAGAIN;
+ goto done2;
+ }
+
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt++;
+ else
+ semptr->semncnt++;
+
+ DPRINTF(("semop: good night!\n"));
+ error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
+ "semwait", 0);
+ DPRINTF(("semop: good morning (error=%d)!\n", error));
+ /* return code is checked below, after sem[nz]cnt-- */
+
+ /*
+ * Make sure that the semaphore still exists
+ */
+ seq = semakptr->u.sem_perm.seq;
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EIDRM;
+ goto done2;
+ }
+
+ /*
+ * Renew the semaphore's pointer after wakeup since
+ * during msleep sem_base may have been modified and semptr
+ * is not valid any more
+ */
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+
+ /*
+ * The semaphore is still alive. Readjust the count of
+ * waiting processes.
+ */
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt--;
+ else
+ semptr->semncnt--;
+
+ /*
+ * Is it really morning, or was our sleep interrupted?
+ * (Delayed check of msleep() return code because we
+ * need to decrement sem[nz]cnt either way.)
+ */
+ if (error != 0) {
+ error = EINTR;
+ goto done2;
+ }
+ DPRINTF(("semop: good morning!\n"));
+ }
+
+done:
+ /*
+ * Process any SEM_UNDO requests.
+ */
+ if (do_undos) {
+ SEMUNDO_LOCK();
+ suptr = NULL;
+ for (i = 0; i < nsops; i++) {
+ /*
+ * We only need to deal with SEM_UNDO's for non-zero
+ * op's.
+ */
+ int adjval;
+
+ if ((sops[i].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[i].sem_op;
+ if (adjval == 0)
+ continue;
+ error = semundo_adjust(td, &suptr, semid, seq,
+ sops[i].sem_num, -adjval);
+ if (error == 0)
+ continue;
+
+ /*
+ * Oh-Oh! We ran out of either sem_undo's or undo's.
+ * Rollback the adjustments to this point and then
+			 * rollback the semaphore ups and downs so we can return
+ * with an error with all structures restored. We
+ * rollback the undo's in the exact reverse order that
+ * we applied them. This guarantees that we won't run
+ * out of space as we roll things back out.
+ */
+ for (j = 0; j < i; j++) {
+ k = i - j - 1;
+ if ((sops[k].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[k].sem_op;
+ if (adjval == 0)
+ continue;
+ if (semundo_adjust(td, &suptr, semid, seq,
+ sops[k].sem_num, adjval) != 0)
+ panic("semop - can't undo undos");
+ }
+
+ for (j = 0; j < nsops; j++)
+ semakptr->u.sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ DPRINTF(("error = %d from semundo_adjust\n", error));
+ SEMUNDO_UNLOCK();
+ goto done2;
+ } /* loop through the sops */
+ SEMUNDO_UNLOCK();
+ } /* if (do_undos) */
+
+ /* We're definitely done - set the sempid's and time */
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+ semptr->sempid = td->td_proc->p_pid;
+ }
+ semakptr->u.sem_otime = time_second;
+
+ /*
+ * Do a wakeup if any semaphore was up'd whilst something was
+ * sleeping on it.
+ */
+ if (do_wakeup) {
+ DPRINTF(("semop: doing wakeup\n"));
+ wakeup(semakptr);
+ DPRINTF(("semop: back from wakeup\n"));
+ }
+ DPRINTF(("semop: done\n"));
+ td->td_retval[0] = 0;
+done2:
+ mtx_unlock(sema_mtxp);
+ if (sops != small_sops)
+		free(sops, M_TEMP);
+ return (error);
+}
+
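+/*
+ * Usage sketch from userland (illustrative only; assumes a set id from
+ * semget()): a P/V pair on semaphore 0.  SEM_UNDO registers an adjustment
+ * that semexit_myhook() below rolls back if the process exits while
+ * holding the semaphore.
+ *
+ *	struct sembuf p = { 0, -1, SEM_UNDO };
+ *	struct sembuf v = { 0,  1, SEM_UNDO };
+ *
+ *	if (semop(id, &p, 1) == -1)
+ *		err(1, "semop");
+ *	... critical section ...
+ *	if (semop(id, &v, 1) == -1)
+ *		err(1, "semop");
+ */
+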
+/*
+ * Go through the undo structures for this process and apply the adjustments to
+ * semaphores.
+ */
+static void
+semexit_myhook(void *arg, struct proc *p)
+{
+ struct sem_undo *suptr;
+ struct semid_kernel *semakptr;
+ struct mtx *sema_mtxp;
+ int semid, semnum, adjval, ix;
+ unsigned short seq;
+
+ /*
+ * Go through the chain of undo vectors looking for one
+ * associated with this process.
+ */
+ SEMUNDO_LOCK();
+ LIST_FOREACH(suptr, &semu_list, un_next) {
+ if (suptr->un_proc == p)
+ break;
+ }
+ if (suptr == NULL) {
+ SEMUNDO_UNLOCK();
+ return;
+ }
+ LIST_REMOVE(suptr, un_next);
+
+ DPRINTF(("proc @%p has undo structure with %d entries\n", p,
+ suptr->un_cnt));
+
+ /*
+ * If there are any active undo elements then process them.
+ */
+ if (suptr->un_cnt > 0) {
+ SEMUNDO_UNLOCK();
+ for (ix = 0; ix < suptr->un_cnt; ix++) {
+ semid = suptr->un_ent[ix].un_id;
+ semnum = suptr->un_ent[ix].un_num;
+ adjval = suptr->un_ent[ix].un_adjval;
+ seq = suptr->un_ent[ix].un_seq;
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ (semakptr->u.sem_perm.seq != seq)) {
+ mtx_unlock(sema_mtxp);
+ continue;
+ }
+ if (semnum >= semakptr->u.sem_nsems)
+ panic("semexit - semnum out of range");
+
+ DPRINTF((
+ "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n",
+ suptr->un_proc, suptr->un_ent[ix].un_id,
+ suptr->un_ent[ix].un_num,
+ suptr->un_ent[ix].un_adjval,
+ semakptr->u.sem_base[semnum].semval));
+
+ if (adjval < 0 && semakptr->u.sem_base[semnum].semval <
+ -adjval)
+ semakptr->u.sem_base[semnum].semval = 0;
+ else
+ semakptr->u.sem_base[semnum].semval += adjval;
+
+ wakeup(semakptr);
+ DPRINTF(("semexit: back from wakeup\n"));
+ mtx_unlock(sema_mtxp);
+ }
+ SEMUNDO_LOCK();
+ }
+
+ /*
+ * Deallocate the undo vector.
+ */
+ DPRINTF(("removing vector\n"));
+ suptr->un_proc = NULL;
+ suptr->un_cnt = 0;
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ SEMUNDO_UNLOCK();
+}
+
+static int
+sysctl_sema(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, sema,
+ sizeof(struct semid_kernel) * seminfo.semmni));
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *semcalls[] = {
+ (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget,
+ (sy_call_t *)sys_semop
+};
+
+/*
+ * Entry point for all SEM calls.
+ */
+int
+sys_semsys(struct thread *td, struct semsys_args *uap)
+{
+	/*
+	 * XXX actually varargs; the arguments are
+	 * { int which, a2, a3, a4, a5; }.
+	 */
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
+ return (EINVAL);
+ error = (*semcalls[uap->which])(td, &uap->a2);
+ return (error);
+}
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7___semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun_old *arg;
+};
+#endif
+int
+freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap)
+{
+ struct semid_ds_old dsold;
+ struct semid_ds dsbuf;
+ union semun_old arg;
+ union semun semun;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(arg.buf, &dsold, sizeof(dsold));
+ if (error)
+ return (error);
+ ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm);
+ CP(dsold, dsbuf, sem_base);
+ CP(dsold, dsbuf, sem_nsems);
+ CP(dsold, dsbuf, sem_otime);
+ CP(dsold, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = arg.array;
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsold, sizeof(dsold));
+ ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm);
+ CP(dsbuf, dsold, sem_base);
+ CP(dsbuf, dsold, sem_nsems);
+ CP(dsbuf, dsold, sem_otime);
+ CP(dsbuf, dsold, sem_ctime);
+ error = copyout(&dsold, arg.buf, sizeof(dsold));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD{4,5,6,7} */
+
+#ifdef COMPAT_FREEBSD32
+
+int
+freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0:
+ return (freebsd7_freebsd32_semctl(td,
+ (struct freebsd7_freebsd32_semctl_args *)&uap->a2));
+ default:
+ return (sys_semsys(td, (struct semsys_args *)uap));
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_semctl(struct thread *td,
+ struct freebsd7_freebsd32_semctl_args *uap)
+{
+ struct semid_ds32_old dsbuf32;
+ struct semid_ds dsbuf;
+ union semun semun;
+ union semun32 arg;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
+ PTRIN_CP(dsbuf32, dsbuf, sem_base);
+ CP(dsbuf32, dsbuf, sem_nsems);
+ CP(dsbuf32, dsbuf, sem_otime);
+ CP(dsbuf32, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = PTRIN(arg.array);
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsbuf32, sizeof(dsbuf32));
+ freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
+ PTROUT_CP(dsbuf, dsbuf32, sem_base);
+ CP(dsbuf, dsbuf32, sem_nsems);
+ CP(dsbuf, dsbuf32, sem_otime);
+ CP(dsbuf, dsbuf32, sem_ctime);
+ error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+#endif
+
+int
+freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap)
+{
+ struct semid_ds32 dsbuf32;
+ struct semid_ds dsbuf;
+ union semun semun;
+ union semun32 arg;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
+ PTRIN_CP(dsbuf32, dsbuf, sem_base);
+ CP(dsbuf32, dsbuf, sem_nsems);
+ CP(dsbuf32, dsbuf, sem_otime);
+ CP(dsbuf32, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = PTRIN(arg.array);
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsbuf32, sizeof(dsbuf32));
+ freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
+ PTROUT_CP(dsbuf, dsbuf32, sem_base);
+ CP(dsbuf, dsbuf32, sem_nsems);
+ CP(dsbuf, dsbuf32, sem_otime);
+ CP(dsbuf, dsbuf32, sem_ctime);
+ error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD32 */
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
new file mode 100644
index 0000000..90f5d77
--- /dev/null
+++ b/sys/kern/sysv_shm.c
@@ -0,0 +1,1407 @@
+/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
+/*-
+ * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Adam Glass and Charles
+ * Hannum.
+ * 4. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/shm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+FEATURE(sysv_shm, "System V shared memory segments support");
+
+static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
+
+static int shmget_allocate_segment(struct thread *td,
+ struct shmget_args *uap, int mode);
+static int shmget_existing(struct thread *td, struct shmget_args *uap,
+ int mode, int segnum);
+
+#define SHMSEG_FREE 0x0200
+#define SHMSEG_REMOVED 0x0400
+#define SHMSEG_ALLOCATED 0x0800
+#define SHMSEG_WANTED 0x1000
+
+static int shm_last_free, shm_nused, shmalloced;
+vm_size_t shm_committed;
+static struct shmid_kernel *shmsegs;
+
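+/*
+ * Per-process attach table (vm_shm): one entry per possible attachment,
+ * recording the attach address and the shmid; a shmid of -1 marks a
+ * free slot.
+ */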
+struct shmmap_state {
+ vm_offset_t va;
+ int shmid;
+};
+
+static void shm_deallocate_segment(struct shmid_kernel *);
+static int shm_find_segment_by_key(key_t);
+static struct shmid_kernel *shm_find_segment_by_shmid(int);
+static struct shmid_kernel *shm_find_segment_by_shmidx(int);
+static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
+static void shmrealloc(void);
+static int shminit(void);
+static int sysvshm_modload(struct module *, int, void *);
+static int shmunload(void);
+static void shmexit_myhook(struct vmspace *vm);
+static void shmfork_myhook(struct proc *p1, struct proc *p2);
+static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
+
+/*
+ * Tuneable values.
+ */
+#ifndef SHMMAXPGS
+#define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */
+#endif
+#ifndef SHMMAX
+#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
+#endif
+#ifndef SHMMIN
+#define SHMMIN 1
+#endif
+#ifndef SHMMNI
+#define SHMMNI 192
+#endif
+#ifndef SHMSEG
+#define SHMSEG 128
+#endif
+#ifndef SHMALL
+#define SHMALL (SHMMAXPGS)
+#endif
+
+struct shminfo shminfo = {
+ SHMMAX,
+ SHMMIN,
+ SHMMNI,
+ SHMSEG,
+ SHMALL
+};
+
+static int shm_use_phys;
+static int shm_allow_removed;
+
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
+ "Maximum shared memory segment size");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
+ "Minimum shared memory segment size");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
+ "Number of shared memory identifiers");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
+ "Number of segments per process");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
+ "Maximum number of pages available for shared memory");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
+ &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW,
+ &shm_allow_removed, 0,
+ "Enable/Disable attachment to attached segments marked for removal");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_shmsegs, "",
+ "Current number of shared memory segments allocated");
+
+static int
+shm_find_segment_by_key(key)
+ key_t key;
+{
+ int i;
+
+ for (i = 0; i < shmalloced; i++)
+ if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
+ shmsegs[i].u.shm_perm.key == key)
+ return (i);
+ return (-1);
+}
+
+static struct shmid_kernel *
+shm_find_segment_by_shmid(int shmid)
+{
+ int segnum;
+ struct shmid_kernel *shmseg;
+
+ segnum = IPCID_TO_IX(shmid);
+ if (segnum < 0 || segnum >= shmalloced)
+ return (NULL);
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
+ (!shm_allow_removed &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
+ shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid))
+ return (NULL);
+ return (shmseg);
+}
+
+static struct shmid_kernel *
+shm_find_segment_by_shmidx(int segnum)
+{
+ struct shmid_kernel *shmseg;
+
+ if (segnum < 0 || segnum >= shmalloced)
+ return (NULL);
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
+ (!shm_allow_removed &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0))
+ return (NULL);
+ return (shmseg);
+}
+
+static void
+shm_deallocate_segment(shmseg)
+ struct shmid_kernel *shmseg;
+{
+ vm_size_t size;
+
+ GIANT_REQUIRED;
+
+ vm_object_deallocate(shmseg->object);
+ shmseg->object = NULL;
+ size = round_page(shmseg->u.shm_segsz);
+ shm_committed -= btoc(size);
+ shm_nused--;
+ shmseg->u.shm_perm.mode = SHMSEG_FREE;
+#ifdef MAC
+ mac_sysvshm_cleanup(shmseg);
+#endif
+ racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
+ racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
+ crfree(shmseg->cred);
+ shmseg->cred = NULL;
+}
+
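+/*
+ * Remove a mapping from the given address space and release the segment
+ * once the last attachment of a segment marked SHMSEG_REMOVED is gone.
+ */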
+static int
+shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
+{
+ struct shmid_kernel *shmseg;
+ int segnum, result;
+ vm_size_t size;
+
+ GIANT_REQUIRED;
+
+ segnum = IPCID_TO_IX(shmmap_s->shmid);
+ shmseg = &shmsegs[segnum];
+ size = round_page(shmseg->u.shm_segsz);
+ result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
+ if (result != KERN_SUCCESS)
+ return (EINVAL);
+ shmmap_s->shmid = -1;
+ shmseg->u.shm_dtime = time_second;
+ if ((--shmseg->u.shm_nattch <= 0) &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = segnum;
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmdt_args {
+ const void *shmaddr;
+};
+#endif
+int
+sys_shmdt(td, uap)
+ struct thread *td;
+ struct shmdt_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct shmmap_state *shmmap_s;
+#ifdef MAC
+ struct shmid_kernel *shmsegptr;
+#endif
+ int i;
+ int error = 0;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
+ if (shmmap_s->shmid != -1 &&
+ shmmap_s->va == (vm_offset_t)uap->shmaddr) {
+ break;
+ }
+ }
+ if (i == shminfo.shmseg) {
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
+ error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr);
+ if (error != 0)
+ goto done2;
+#endif
+ error = shm_delete_mapping(p->p_vmspace, shmmap_s);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args {
+ int shmid;
+ const void *shmaddr;
+ int shmflg;
+};
+#endif
+int
+kern_shmat(td, shmid, shmaddr, shmflg)
+ struct thread *td;
+ int shmid;
+ const void *shmaddr;
+ int shmflg;
+{
+ struct proc *p = td->td_proc;
+ int i, flags;
+ struct shmid_kernel *shmseg;
+ struct shmmap_state *shmmap_s = NULL;
+ vm_offset_t attach_va;
+ vm_prot_t prot;
+ vm_size_t size;
+ int rv;
+ int error = 0;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
+ M_SHM, M_WAITOK);
+ for (i = 0; i < shminfo.shmseg; i++)
+ shmmap_s[i].shmid = -1;
+ p->p_vmspace->vm_shm = shmmap_s;
+ }
+ shmseg = shm_find_segment_by_shmid(shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ error = ipcperm(td, &shmseg->u.shm_perm,
+ (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
+ if (error)
+ goto done2;
+#ifdef MAC
+ error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
+ if (error != 0)
+ goto done2;
+#endif
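+	/* Find a free slot in the per-process attach table. */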
+ for (i = 0; i < shminfo.shmseg; i++) {
+ if (shmmap_s->shmid == -1)
+ break;
+ shmmap_s++;
+ }
+ if (i >= shminfo.shmseg) {
+ error = EMFILE;
+ goto done2;
+ }
+ size = round_page(shmseg->u.shm_segsz);
+ prot = VM_PROT_READ;
+ if ((shmflg & SHM_RDONLY) == 0)
+ prot |= VM_PROT_WRITE;
+ flags = MAP_ANON | MAP_SHARED;
+ if (shmaddr) {
+ flags |= MAP_FIXED;
+ if (shmflg & SHM_RND) {
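+			/* Round the address down to an SHMLBA boundary. */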
+ attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1);
+ } else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) {
+ attach_va = (vm_offset_t)shmaddr;
+ } else {
+ error = EINVAL;
+ goto done2;
+ }
+ } else {
+ /*
+ * This is just a hint to vm_map_find() about where to
+ * put it.
+ */
+ PROC_LOCK(p);
+ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
+ lim_max(p, RLIMIT_DATA));
+ PROC_UNLOCK(p);
+ }
+
+ vm_object_reference(shmseg->object);
+ rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object,
+ 0, &attach_va, size, (flags & MAP_FIXED) ? VMFS_NO_SPACE :
+ VMFS_OPTIMAL_SPACE, prot, prot, MAP_INHERIT_SHARE);
+ if (rv != KERN_SUCCESS) {
+ vm_object_deallocate(shmseg->object);
+ error = ENOMEM;
+ goto done2;
+ }
+
+ shmmap_s->va = attach_va;
+ shmmap_s->shmid = shmid;
+ shmseg->u.shm_lpid = p->p_pid;
+ shmseg->u.shm_atime = time_second;
+ shmseg->u.shm_nattch++;
+ td->td_retval[0] = attach_va;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+sys_shmat(td, uap)
+ struct thread *td;
+ struct shmat_args *uap;
+{
+ return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
+}
+
+int
+kern_shmctl(td, shmid, cmd, buf, bufsz)
+ struct thread *td;
+ int shmid;
+ int cmd;
+ void *buf;
+ size_t *bufsz;
+{
+ int error = 0;
+ struct shmid_kernel *shmseg;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ switch (cmd) {
+ /*
+ * It is possible that kern_shmctl is being called from the Linux ABI
+ * layer, in which case, we will need to implement IPC_INFO. It should
+ * be noted that other shmctl calls will be funneled through here for
+	 * Linux binaries as well.
+ *
+ * NB: The Linux ABI layer will convert this data to structure(s) more
+ * consistent with the Linux ABI.
+ */
+ case IPC_INFO:
+ memcpy(buf, &shminfo, sizeof(shminfo));
+ if (bufsz)
+ *bufsz = sizeof(shminfo);
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ case SHM_INFO: {
+ struct shm_info shm_info;
+ shm_info.used_ids = shm_nused;
+ shm_info.shm_rss = 0; /*XXX where to get from ? */
+ shm_info.shm_tot = 0; /*XXX where to get from ? */
+ shm_info.shm_swp = 0; /*XXX where to get from ? */
+ shm_info.swap_attempts = 0; /*XXX where to get from ? */
+ shm_info.swap_successes = 0; /*XXX where to get from ? */
+ memcpy(buf, &shm_info, sizeof(shm_info));
+ if (bufsz)
+ *bufsz = sizeof(shm_info);
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ }
+ }
+ if (cmd == SHM_STAT)
+ shmseg = shm_find_segment_by_shmidx(shmid);
+ else
+ shmseg = shm_find_segment_by_shmid(shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ switch (cmd) {
+ case SHM_STAT:
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
+ if (error)
+ goto done2;
+ memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
+ if (bufsz)
+ *bufsz = sizeof(struct shmid_ds);
+ if (cmd == SHM_STAT)
+ td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm);
+ break;
+ case IPC_SET: {
+ struct shmid_ds *shmid;
+
+ shmid = (struct shmid_ds *)buf;
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ shmseg->u.shm_perm.uid = shmid->shm_perm.uid;
+ shmseg->u.shm_perm.gid = shmid->shm_perm.gid;
+ shmseg->u.shm_perm.mode =
+ (shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
+ (shmid->shm_perm.mode & ACCESSPERMS);
+ shmseg->u.shm_ctime = time_second;
+ break;
+ }
+ case IPC_RMID:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ shmseg->u.shm_perm.key = IPC_PRIVATE;
+ shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
+ if (shmseg->u.shm_nattch <= 0) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = IPCID_TO_IX(shmid);
+ }
+ break;
+#if 0
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+#endif
+ default:
+ error = EINVAL;
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds *buf;
+};
+#endif
+int
+sys_shmctl(td, uap)
+ struct thread *td;
+ struct shmctl_args *uap;
+{
+ int error = 0;
+ struct shmid_ds buf;
+ size_t bufsz;
+
+ /*
+	 * The only reason IPC_INFO, SHM_INFO and SHM_STAT exist is to support
+	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
+	 * return an error back to the user since we do not support them.
+ */
+ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
+ uap->cmd == SHM_STAT)
+ return (EINVAL);
+
+ /* IPC_SET needs to copyin the buffer before calling kern_shmctl */
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
+ goto done;
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = copyout(&buf, uap->buf, bufsz);
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+
+
+static int
+shmget_existing(td, uap, mode, segnum)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+ int segnum;
+{
+ struct shmid_kernel *shmseg;
+ int error;
+
+ shmseg = &shmsegs[segnum];
+ if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) {
+ /*
+ * This segment is in the process of being allocated. Wait
+ * until it's done, and look the key up again (in case the
+ * allocation failed or it was freed).
+ */
+ shmseg->u.shm_perm.mode |= SHMSEG_WANTED;
+ error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0);
+ if (error)
+ return (error);
+ return (EAGAIN);
+ }
+ if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
+ return (EEXIST);
+#ifdef MAC
+ error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg);
+ if (error != 0)
+ return (error);
+#endif
+ if (uap->size != 0 && uap->size > shmseg->u.shm_segsz)
+ return (EINVAL);
+ td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
+ return (0);
+}
+
+static int
+shmget_allocate_segment(td, uap, mode)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+{
+ int i, segnum, shmid;
+ size_t size;
+ struct ucred *cred = td->td_ucred;
+ struct shmid_kernel *shmseg;
+ vm_object_t shm_object;
+
+ GIANT_REQUIRED;
+
+ if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
+ return (EINVAL);
+ if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
+ return (ENOSPC);
+ size = round_page(uap->size);
+ if (shm_committed + btoc(size) > shminfo.shmall)
+ return (ENOMEM);
+ if (shm_last_free < 0) {
+ shmrealloc(); /* Maybe expand the shmsegs[] array. */
+ for (i = 0; i < shmalloced; i++)
+ if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
+ break;
+ if (i == shmalloced)
+ return (ENOSPC);
+ segnum = i;
+ } else {
+ segnum = shm_last_free;
+ shm_last_free = -1;
+ }
+ shmseg = &shmsegs[segnum];
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
+ PROC_UNLOCK(td->td_proc);
+ return (ENOSPC);
+ }
+ if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
+ racct_sub(td->td_proc, RACCT_NSHM, 1);
+ PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+ /*
+ * In case we sleep in malloc(), mark the segment present but deleted
+	 * so that no one else tries to create the same key.
+ */
+ shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
+ shmseg->u.shm_perm.key = uap->key;
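+	/*
+	 * Advance the 15-bit sequence number so the IPC id constructed
+	 * below differs from any id previously handed out for this slot;
+	 * stale shmids then fail the seq check in
+	 * shm_find_segment_by_shmid().
+	 */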
+ shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
+ shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
+
+ /*
+	 * Make sure the backing VM object (pager) is allocated before
+	 * we need it.
+ */
+ shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
+ 0, size, VM_PROT_DEFAULT, 0, cred);
+ if (shm_object == NULL) {
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ racct_sub(td->td_proc, RACCT_NSHM, 1);
+ racct_sub(td->td_proc, RACCT_SHMSIZE, size);
+ PROC_UNLOCK(td->td_proc);
+#endif
+ return (ENOMEM);
+ }
+ VM_OBJECT_WLOCK(shm_object);
+ vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shm_object, OBJ_NOSPLIT);
+ VM_OBJECT_WUNLOCK(shm_object);
+
+ shmseg->object = shm_object;
+ shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
+ shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
+ shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
+ (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
+ shmseg->cred = crhold(cred);
+ shmseg->u.shm_segsz = uap->size;
+ shmseg->u.shm_cpid = td->td_proc->p_pid;
+ shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
+ shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
+#ifdef MAC
+ mac_sysvshm_create(cred, shmseg);
+#endif
+ shmseg->u.shm_ctime = time_second;
+ shm_committed += btoc(size);
+ shm_nused++;
+ if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) {
+ /*
+ * Somebody else wanted this key while we were asleep. Wake
+ * them up now.
+ */
+ shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED;
+ wakeup(shmseg);
+ }
+ td->td_retval[0] = shmid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmget_args {
+ key_t key;
+ size_t size;
+ int shmflg;
+};
+#endif
+int
+sys_shmget(td, uap)
+ struct thread *td;
+ struct shmget_args *uap;
+{
+ int segnum, mode;
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ mode = uap->shmflg & ACCESSPERMS;
+ if (uap->key != IPC_PRIVATE) {
+ again:
+ segnum = shm_find_segment_by_key(uap->key);
+ if (segnum >= 0) {
+ error = shmget_existing(td, uap, mode, segnum);
+ if (error == EAGAIN)
+ goto again;
+ goto done2;
+ }
+ if ((uap->shmflg & IPC_CREAT) == 0) {
+ error = ENOENT;
+ goto done2;
+ }
+ }
+ error = shmget_allocate_segment(td, uap, mode);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static void
+shmfork_myhook(p1, p2)
+ struct proc *p1, *p2;
+{
+ struct shmmap_state *shmmap_s;
+ size_t size;
+ int i;
+
+ mtx_lock(&Giant);
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
+ p2->p_vmspace->vm_shm = shmmap_s;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
+ mtx_unlock(&Giant);
+}
+
+static void
+shmexit_myhook(struct vmspace *vm)
+{
+ struct shmmap_state *base, *shm;
+ int i;
+
+ if ((base = vm->vm_shm) != NULL) {
+ vm->vm_shm = NULL;
+ mtx_lock(&Giant);
+ for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
+ if (shm->shmid != -1)
+ shm_delete_mapping(vm, shm);
+ }
+ mtx_unlock(&Giant);
+ free(base, M_SHM);
+ }
+}
+
+static void
+shmrealloc(void)
+{
+ int i;
+ struct shmid_kernel *newsegs;
+
+ if (shmalloced >= shminfo.shmmni)
+ return;
+
+ newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
+ if (newsegs == NULL)
+ return;
+ for (i = 0; i < shmalloced; i++)
+ bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
+ for (; i < shminfo.shmmni; i++) {
+		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
+		newsegs[i].u.shm_perm.seq = 0;
+#ifdef MAC
+		mac_sysvshm_init(&newsegs[i]);
+#endif
+ }
+ free(shmsegs, M_SHM);
+ shmsegs = newsegs;
+ shmalloced = shminfo.shmmni;
+}
+
+static struct syscall_helper_data shm_syscalls[] = {
+ SYSCALL_INIT_HELPER(shmat),
+ SYSCALL_INIT_HELPER(shmctl),
+ SYSCALL_INIT_HELPER(shmdt),
+ SYSCALL_INIT_HELPER(shmget),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
+#endif
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
+ SYSCALL_INIT_HELPER(shmsys),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data shm32_syscalls[] = {
+ SYSCALL32_INIT_HELPER_COMPAT(shmat),
+ SYSCALL32_INIT_HELPER_COMPAT(shmdt),
+ SYSCALL32_INIT_HELPER_COMPAT(shmget),
+ SYSCALL32_INIT_HELPER(freebsd32_shmsys),
+ SYSCALL32_INIT_HELPER(freebsd32_shmctl),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+shminit()
+{
+ int i, error;
+
+#ifndef BURN_BRIDGES
+ if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
+ printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
+#endif
+ TUNABLE_ULONG_FETCH("kern.ipc.shmall", &shminfo.shmall);
+ if (!TUNABLE_ULONG_FETCH("kern.ipc.shmmax", &shminfo.shmmax)) {
+ /* Initialize shmmax dealing with possible overflow. */
+ for (i = PAGE_SIZE; i > 0; i--) {
+ shminfo.shmmax = shminfo.shmall * i;
+ if (shminfo.shmmax >= shminfo.shmall)
+ break;
+ }
+ }
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
+ TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
+
+ shmalloced = shminfo.shmmni;
+ shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
+ for (i = 0; i < shmalloced; i++) {
+ shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
+ shmsegs[i].u.shm_perm.seq = 0;
+#ifdef MAC
+ mac_sysvshm_init(&shmsegs[i]);
+#endif
+ }
+ shm_last_free = 0;
+ shm_nused = 0;
+ shm_committed = 0;
+ shmexit_hook = &shmexit_myhook;
+ shmfork_hook = &shmfork_myhook;
+
+ error = syscall_helper_register(shm_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(shm32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+shmunload()
+{
+ int i;
+
+ if (shm_nused > 0)
+ return (EBUSY);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(shm32_syscalls);
+#endif
+ syscall_helper_unregister(shm_syscalls);
+
+ for (i = 0; i < shmalloced; i++) {
+#ifdef MAC
+ mac_sysvshm_destroy(&shmsegs[i]);
+#endif
+ /*
+		 * Objects might still be mapped into the processes'
+		 * address spaces.  The actual free happens when the
+		 * last mapping is destroyed.
+ */
+ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
+ vm_object_deallocate(shmsegs[i].object);
+ }
+ free(shmsegs, M_SHM);
+ shmexit_hook = NULL;
+ shmfork_hook = NULL;
+ return (0);
+}
+
+static int
+sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
+}
+
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
+struct oshmid_ds {
+ struct ipc_perm_old shm_perm; /* operation perms */
+ int shm_segsz; /* size of segment (bytes) */
+ u_short shm_cpid; /* pid, creator */
+ u_short shm_lpid; /* pid, last operation */
+ short shm_nattch; /* no. of current attaches */
+ time_t shm_atime; /* last attach time */
+ time_t shm_dtime; /* last detach time */
+ time_t shm_ctime; /* last change time */
+ void *shm_handle; /* internal handle for shm segment */
+};
+
+struct oshmctl_args {
+ int shmid;
+ int cmd;
+ struct oshmid_ds *ubuf;
+};
+
+static int
+oshmctl(struct thread *td, struct oshmctl_args *uap)
+{
+#ifdef COMPAT_43
+ int error = 0;
+ struct shmid_kernel *shmseg;
+ struct oshmid_ds outbuf;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
+ if (error)
+ goto done2;
+#ifdef MAC
+ error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
+ outbuf.shm_segsz = shmseg->u.shm_segsz;
+ outbuf.shm_cpid = shmseg->u.shm_cpid;
+ outbuf.shm_lpid = shmseg->u.shm_lpid;
+ outbuf.shm_nattch = shmseg->u.shm_nattch;
+ outbuf.shm_atime = shmseg->u.shm_atime;
+ outbuf.shm_dtime = shmseg->u.shm_dtime;
+ outbuf.shm_ctime = shmseg->u.shm_ctime;
+ outbuf.shm_handle = shmseg->object;
+ error = copyout(&outbuf, uap->ubuf, sizeof(outbuf));
+ if (error)
+ goto done2;
+ break;
+ default:
+ error = freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap);
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+#else
+ return (EINVAL);
+#endif
+}
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *shmcalls[] = {
+ (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
+ (sy_call_t *)freebsd7_shmctl
+};
+
+int
+sys_shmsys(td, uap)
+ struct thread *td;
+ /* XXX actually varargs. */
+ struct shmsys_args /* {
+ int which;
+ int a2;
+ int a3;
+ int a4;
+ } */ *uap;
+{
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = (*shmcalls[uap->which])(td, &uap->a2);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
+
+#ifdef COMPAT_FREEBSD32
+
+int
+freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0: { /* shmat */
+ struct shmat_args ap;
+
+ ap.shmid = uap->a2;
+ ap.shmaddr = PTRIN(uap->a3);
+ ap.shmflg = uap->a4;
+ return (sysent[SYS_shmat].sy_call(td, &ap));
+ }
+ case 2: { /* shmdt */
+ struct shmdt_args ap;
+
+ ap.shmaddr = PTRIN(uap->a2);
+ return (sysent[SYS_shmdt].sy_call(td, &ap));
+ }
+ case 3: { /* shmget */
+ struct shmget_args ap;
+
+ ap.key = uap->a2;
+ ap.size = uap->a3;
+ ap.shmflg = uap->a4;
+ return (sysent[SYS_shmget].sy_call(td, &ap));
+ }
+ case 4: { /* shmctl */
+ struct freebsd7_freebsd32_shmctl_args ap;
+
+ ap.shmid = uap->a2;
+ ap.cmd = uap->a3;
+ ap.buf = PTRIN(uap->a4);
+ return (freebsd7_freebsd32_shmctl(td, &ap));
+ }
+ case 1: /* oshmctl */
+ default:
+ return (EINVAL);
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_shmctl(struct thread *td,
+ struct freebsd7_freebsd32_shmctl_args *uap)
+{
+ int error = 0;
+ union {
+ struct shmid_ds shmid_ds;
+ struct shm_info shm_info;
+ struct shminfo shminfo;
+ } u;
+ union {
+ struct shmid_ds32_old shmid_ds32;
+ struct shm_info32 shm_info32;
+ struct shminfo32 shminfo32;
+ } u32;
+ size_t sz;
+
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &u32.shmid_ds32,
+ sizeof(u32.shmid_ds32))))
+ goto done;
+ freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
+ &u.shmid_ds.shm_perm);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_INFO:
+ CP(u.shminfo, u32.shminfo32, shmmax);
+ CP(u.shminfo, u32.shminfo32, shmmin);
+ CP(u.shminfo, u32.shminfo32, shmmni);
+ CP(u.shminfo, u32.shminfo32, shmseg);
+ CP(u.shminfo, u32.shminfo32, shmall);
+ error = copyout(&u32.shminfo32, uap->buf,
+ sizeof(u32.shminfo32));
+ break;
+ case SHM_INFO:
+ CP(u.shm_info, u32.shm_info32, used_ids);
+ CP(u.shm_info, u32.shm_info32, shm_rss);
+ CP(u.shm_info, u32.shm_info32, shm_tot);
+ CP(u.shm_info, u32.shm_info32, shm_swp);
+ CP(u.shm_info, u32.shm_info32, swap_attempts);
+ CP(u.shm_info, u32.shm_info32, swap_successes);
+ error = copyout(&u32.shm_info32, uap->buf,
+ sizeof(u32.shm_info32));
+ break;
+ case SHM_STAT:
+ case IPC_STAT:
+ freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
+ &u32.shmid_ds32.shm_perm);
+ if (u.shmid_ds.shm_segsz > INT32_MAX)
+ u32.shmid_ds32.shm_segsz = INT32_MAX;
+ else
+ CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
+ u32.shmid_ds32.shm_internal = 0;
+ error = copyout(&u32.shmid_ds32, uap->buf,
+ sizeof(u32.shmid_ds32));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+#endif
+
+int
+freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
+{
+ int error = 0;
+ union {
+ struct shmid_ds shmid_ds;
+ struct shm_info shm_info;
+ struct shminfo shminfo;
+ } u;
+ union {
+ struct shmid_ds32 shmid_ds32;
+ struct shm_info32 shm_info32;
+ struct shminfo32 shminfo32;
+ } u32;
+ size_t sz;
+
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &u32.shmid_ds32,
+ sizeof(u32.shmid_ds32))))
+ goto done;
+ freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
+ &u.shmid_ds.shm_perm);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_INFO:
+ CP(u.shminfo, u32.shminfo32, shmmax);
+ CP(u.shminfo, u32.shminfo32, shmmin);
+ CP(u.shminfo, u32.shminfo32, shmmni);
+ CP(u.shminfo, u32.shminfo32, shmseg);
+ CP(u.shminfo, u32.shminfo32, shmall);
+ error = copyout(&u32.shminfo32, uap->buf,
+ sizeof(u32.shminfo32));
+ break;
+ case SHM_INFO:
+ CP(u.shm_info, u32.shm_info32, used_ids);
+ CP(u.shm_info, u32.shm_info32, shm_rss);
+ CP(u.shm_info, u32.shm_info32, shm_tot);
+ CP(u.shm_info, u32.shm_info32, shm_swp);
+ CP(u.shm_info, u32.shm_info32, swap_attempts);
+ CP(u.shm_info, u32.shm_info32, swap_successes);
+ error = copyout(&u32.shm_info32, uap->buf,
+ sizeof(u32.shm_info32));
+ break;
+ case SHM_STAT:
+ case IPC_STAT:
+ freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
+ &u32.shmid_ds32.shm_perm);
+ if (u.shmid_ds.shm_segsz > INT32_MAX)
+ u32.shmid_ds32.shm_segsz = INT32_MAX;
+ else
+ CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
+ error = copyout(&u32.shmid_ds32, uap->buf,
+ sizeof(u32.shmid_ds32));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+#endif
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7_shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds_old *buf;
+};
+#endif
+int
+freebsd7_shmctl(td, uap)
+ struct thread *td;
+ struct freebsd7_shmctl_args *uap;
+{
+ int error = 0;
+ struct shmid_ds_old old;
+ struct shmid_ds buf;
+ size_t bufsz;
+
+ /*
+	 * The only reason IPC_INFO, SHM_INFO and SHM_STAT exist is to support
+	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
+	 * return an error back to the user since we do not support them.
+ */
+ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
+ uap->cmd == SHM_STAT)
+ return (EINVAL);
+
+ /* IPC_SET needs to copyin the buffer before calling kern_shmctl */
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &old, sizeof(old))))
+ goto done;
+ ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
+ CP(old, buf, shm_segsz);
+ CP(old, buf, shm_lpid);
+ CP(old, buf, shm_cpid);
+ CP(old, buf, shm_nattch);
+ CP(old, buf, shm_atime);
+ CP(old, buf, shm_dtime);
+ CP(old, buf, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_STAT:
+ ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
+ if (buf.shm_segsz > INT_MAX)
+ old.shm_segsz = INT_MAX;
+ else
+ CP(buf, old, shm_segsz);
+ CP(buf, old, shm_lpid);
+ CP(buf, old, shm_cpid);
+ if (buf.shm_nattch > SHRT_MAX)
+ old.shm_nattch = SHRT_MAX;
+ else
+ CP(buf, old, shm_nattch);
+ CP(buf, old, shm_atime);
+ CP(buf, old, shm_dtime);
+ CP(buf, old, shm_ctime);
+ old.shm_internal = NULL;
+ error = copyout(&old, uap->buf, sizeof(old));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
+ COMPAT_FREEBSD7 */
+
+static int
+sysvshm_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = shminit();
+ if (error != 0)
+ shmunload();
+ break;
+ case MOD_UNLOAD:
+ error = shmunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvshm_mod = {
+ "sysvshm",
+ &sysvshm_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvshm, 1);
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
new file mode 100644
index 0000000..4fce607
--- /dev/null
+++ b/sys/kern/tty.c
@@ -0,0 +1,2209 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#ifdef COMPAT_43TTY
+#include <sys/ioctl_compat.h>
+#endif /* COMPAT_43TTY */
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/serial.h>
+#include <sys/signal.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+#define TTYDEFCHARS
+#include <sys/ttydefaults.h>
+#undef TTYDEFCHARS
+#include <sys/ucred.h>
+#include <sys/vnode.h>
+
+#include <machine/stdarg.h>
+
+static MALLOC_DEFINE(M_TTY, "tty", "tty device");
+
+static void tty_rel_free(struct tty *tp);
+
+static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
+static struct sx tty_list_sx;
+SX_SYSINIT(tty_list, &tty_list_sx, "tty list");
+static unsigned int tty_list_count = 0;
+
+/* Character device of /dev/console. */
+static struct cdev *dev_console;
+static const char *dev_console_filename;
+
+/*
+ * Flags that are supported and stored by this implementation.
+ */
+#define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\
+ INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL)
+#define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET)
+#define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\
+ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\
+ FLUSHO|NOKERNINFO|NOFLSH)
+#define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\
+ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
+ CDSR_OFLOW|CCAR_OFLOW)
+
+#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
+
+/*
+ * Set TTY buffer sizes.
+ */
+
+#define TTYBUF_MAX 65536
+
+static void
+tty_watermarks(struct tty *tp)
+{
+ size_t bs = 0;
+
+ /* Provide an input buffer for 0.2 seconds of data. */
+ if (tp->t_termios.c_cflag & CREAD)
+ bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX);
+ ttyinq_setsize(&tp->t_inq, tp, bs);
+
+ /* Set low watermark at 10% (when 90% is available). */
+ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10;
+
+ /* Provide an output buffer for 0.2 seconds of data. */
+ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX);
+ ttyoutq_setsize(&tp->t_outq, tp, bs);
+
+ /* Set low watermark at 10% (when 90% is available). */
+ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10;
+}
+
+static int
+tty_drain(struct tty *tp)
+{
+ int error;
+
+ if (ttyhook_hashook(tp, getc_inject))
+ /* buffer is inaccessible */
+ return (0);
+
+ while (ttyoutq_bytesused(&tp->t_outq) > 0) {
+ ttydevsw_outwakeup(tp);
+ /* Could be handled synchronously. */
+ if (ttyoutq_bytesused(&tp->t_outq) == 0)
+ return (0);
+
+ /* Wait for data to be drained. */
+ error = tty_wait(tp, &tp->t_outwait);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Though ttydev_enter() and ttydev_leave() seem to be related, they
+ * don't have to be used together. ttydev_enter() is used by the cdev
+ * operations to prevent an actual operation from being processed when
+ * the TTY has been abandoned. ttydev_leave() is used by ttydev_open()
+ * and ttydev_close() to determine whether per-TTY data should be
+ * deallocated.
+ */
+
+static __inline int
+ttydev_enter(struct tty *tp)
+{
+ tty_lock(tp);
+
+ if (tty_gone(tp) || !tty_opened(tp)) {
+ /* Device is already gone. */
+ tty_unlock(tp);
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static void
+ttydev_leave(struct tty *tp)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) {
+ /* Device is still opened somewhere. */
+ tty_unlock(tp);
+ return;
+ }
+
+ tp->t_flags |= TF_OPENCLOSE;
+
+ /* Stop asynchronous I/O. */
+ funsetown(&tp->t_sigio);
+
+ /* Remove console TTY. */
+ if (constty == tp)
+ constty_clear();
+
+ /* Drain any output. */
+ MPASS((tp->t_flags & TF_STOPPED) == 0);
+ if (!tty_gone(tp))
+ tty_drain(tp);
+
+ ttydisc_close(tp);
+
+ /* Destroy associated buffers already. */
+ ttyinq_free(&tp->t_inq);
+ tp->t_inlow = 0;
+ ttyoutq_free(&tp->t_outq);
+ tp->t_outlow = 0;
+
+ knlist_clear(&tp->t_inpoll.si_note, 1);
+ knlist_clear(&tp->t_outpoll.si_note, 1);
+
+ if (!tty_gone(tp))
+ ttydevsw_close(tp);
+
+ tp->t_flags &= ~TF_OPENCLOSE;
+ cv_broadcast(&tp->t_dcdwait);
+ tty_rel_free(tp);
+}
+
+/*
+ * Operations that are exposed through the character device in /dev.
+ */
+static int
+ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+ int error = 0;
+
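+	/*
+	 * Wait until the driver has associated a TTY with this character
+	 * device (si_drv1 is set once the device is fully created).
+	 */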
+ while ((tp = dev->si_drv1) == NULL) {
+ error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+ if (error != EWOULDBLOCK)
+ return (error);
+ }
+
+ tty_lock(tp);
+ if (tty_gone(tp)) {
+ /* Device is already gone. */
+ tty_unlock(tp);
+ return (ENXIO);
+ }
+
+ /*
+ * Block when other processes are currently opening or closing
+ * the TTY.
+ */
+ while (tp->t_flags & TF_OPENCLOSE) {
+ error = tty_wait(tp, &tp->t_dcdwait);
+ if (error != 0) {
+ tty_unlock(tp);
+ return (error);
+ }
+ }
+ tp->t_flags |= TF_OPENCLOSE;
+
+ /*
+ * Make sure the "tty" and "cua" device cannot be opened at the
+ * same time.
+ */
+ if (TTY_CALLOUT(tp, dev)) {
+ if (tp->t_flags & TF_OPENED_IN) {
+ error = EBUSY;
+ goto done;
+ }
+ } else {
+ if (tp->t_flags & TF_OPENED_OUT) {
+ error = EBUSY;
+ goto done;
+ }
+ }
+
+ if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) {
+ error = EBUSY;
+ goto done;
+ }
+
+ if (!tty_opened(tp)) {
+ /* Set proper termios flags. */
+ if (TTY_CALLOUT(tp, dev))
+ tp->t_termios = tp->t_termios_init_out;
+ else
+ tp->t_termios = tp->t_termios_init_in;
+ ttydevsw_param(tp, &tp->t_termios);
+ /* Prevent modem control on callout devices and /dev/console. */
+ if (TTY_CALLOUT(tp, dev) || dev == dev_console)
+ tp->t_termios.c_cflag |= CLOCAL;
+
+ ttydevsw_modem(tp, SER_DTR|SER_RTS, 0);
+
+ error = ttydevsw_open(tp);
+ if (error != 0)
+ goto done;
+
+ ttydisc_open(tp);
+ tty_watermarks(tp);
+ }
+
+ /* Wait for Carrier Detect. */
+ if ((oflags & O_NONBLOCK) == 0 &&
+ (tp->t_termios.c_cflag & CLOCAL) == 0) {
+ while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) {
+ error = tty_wait(tp, &tp->t_dcdwait);
+ if (error != 0)
+ goto done;
+ }
+ }
+
+ if (dev == dev_console)
+ tp->t_flags |= TF_OPENED_CONS;
+ else if (TTY_CALLOUT(tp, dev))
+ tp->t_flags |= TF_OPENED_OUT;
+ else
+ tp->t_flags |= TF_OPENED_IN;
+
+done: tp->t_flags &= ~TF_OPENCLOSE;
+ cv_broadcast(&tp->t_dcdwait);
+ ttydev_leave(tp);
+
+ return (error);
+}
+
+static int
+ttydev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+
+ tty_lock(tp);
+
+ /*
+ * Don't actually close the device if it is being used as the
+ * console.
+ */
+ MPASS((tp->t_flags & TF_OPENED) != TF_OPENED);
+ if (dev == dev_console)
+ tp->t_flags &= ~TF_OPENED_CONS;
+ else
+ tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT);
+
+ if (tp->t_flags & TF_OPENED) {
+ tty_unlock(tp);
+ return (0);
+ }
+
+ /*
+ * This can only be called once. The callin and the callout
+ * devices cannot be opened at the same time.
+ */
+ tp->t_flags &= ~(TF_EXCLUDE|TF_STOPPED);
+
+ /* Properly wake up threads that are stuck - revoke(). */
+ tp->t_revokecnt++;
+ tty_wakeup(tp, FREAD|FWRITE);
+ cv_broadcast(&tp->t_bgwait);
+ cv_broadcast(&tp->t_dcdwait);
+
+ ttydev_leave(tp);
+
+ return (0);
+}
+
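+/*
+ * Return non-zero when the terminal is the controlling TTY of process p,
+ * i.e. p belongs to the TTY's session and has P_CONTROLT set.
+ */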
+static __inline int
+tty_is_ctty(struct tty *tp, struct proc *p)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT);
+}
+
+int
+tty_wait_background(struct tty *tp, struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+ struct pgrp *pg;
+ ksiginfo_t ksi;
+ int error;
+
+ MPASS(sig == SIGTTIN || sig == SIGTTOU);
+ tty_lock_assert(tp, MA_OWNED);
+
+ for (;;) {
+ PROC_LOCK(p);
+ /*
+		 * The process should only sleep when:
+		 * - This terminal is the controlling terminal
+		 * - Its process group is not the foreground process
+		 *   group
+		 * - The parent process isn't waiting for the child to
+		 *   exit
+		 * - The signal to send to the process isn't masked
+ */
+ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) {
+ /* Allow the action to happen. */
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) ||
+ SIGISMEMBER(td->td_sigmask, sig)) {
+ /* Only allow them in write()/ioctl(). */
+ PROC_UNLOCK(p);
+ return (sig == SIGTTOU ? 0 : EIO);
+ }
+
+ pg = p->p_pgrp;
+ if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) {
+ /* Don't allow the action to happen. */
+ PROC_UNLOCK(p);
+ return (EIO);
+ }
+ PROC_UNLOCK(p);
+
+ /*
+ * Send the signal and sleep until we're the new
+ * foreground process group.
+ */
+ if (sig != 0) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_code = SI_KERNEL;
+ ksi.ksi_signo = sig;
+ sig = 0;
+ }
+ PGRP_LOCK(pg);
+ pgsignal(pg, ksi.ksi_signo, 1, &ksi);
+ PGRP_UNLOCK(pg);
+
+ error = tty_wait(tp, &tp->t_bgwait);
+ if (error)
+ return (error);
+ }
+}
+
+static int
+ttydev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ goto done;
+ error = ttydisc_read(tp, uio, ioflag);
+ tty_unlock(tp);
+
+ /*
+ * The read() call should not throw an error when the device is
+ * being destroyed. Silently convert it to an EOF.
+ */
+done: if (error == ENXIO)
+ error = 0;
+ return (error);
+}
+
+static int
+ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ if (tp->t_termios.c_lflag & TOSTOP) {
+ error = tty_wait_background(tp, curthread, SIGTTOU);
+ if (error)
+ goto done;
+ }
+
+ if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) {
+ /* Allow non-blocking writes to bypass serialization. */
+ error = ttydisc_write(tp, uio, ioflag);
+ } else {
+ /* Serialize write() calls. */
+ while (tp->t_flags & TF_BUSY_OUT) {
+ error = tty_wait(tp, &tp->t_outserwait);
+ if (error)
+ goto done;
+ }
+
+ tp->t_flags |= TF_BUSY_OUT;
+ error = ttydisc_write(tp, uio, ioflag);
+ tp->t_flags &= ~TF_BUSY_OUT;
+ cv_signal(&tp->t_outserwait);
+ }
+
+done: tty_unlock(tp);
+ return (error);
+}
+
+static int
+ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ switch (cmd) {
+ case TIOCCBRK:
+ case TIOCCONS:
+ case TIOCDRAIN:
+ case TIOCEXCL:
+ case TIOCFLUSH:
+ case TIOCNXCL:
+ case TIOCSBRK:
+ case TIOCSCTTY:
+ case TIOCSETA:
+ case TIOCSETAF:
+ case TIOCSETAW:
+ case TIOCSPGRP:
+ case TIOCSTART:
+ case TIOCSTAT:
+ case TIOCSTI:
+ case TIOCSTOP:
+ case TIOCSWINSZ:
+#if 0
+ case TIOCSDRAINWAIT:
+ case TIOCSETD:
+#endif
+#ifdef COMPAT_43TTY
+ case TIOCLBIC:
+ case TIOCLBIS:
+ case TIOCLSET:
+ case TIOCSETC:
+ case OTIOCSETD:
+ case TIOCSETN:
+ case TIOCSETP:
+ case TIOCSLTC:
+#endif /* COMPAT_43TTY */
+ /*
+ * If the ioctl() causes the TTY to be modified, let it
+ * wait in the background.
+ */
+ error = tty_wait_background(tp, curthread, SIGTTOU);
+ if (error)
+ goto done;
+ }
+
+ if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ struct termios *old = &tp->t_termios;
+ struct termios *new = (struct termios *)data;
+ struct termios *lock = TTY_CALLOUT(tp, dev) ?
+ &tp->t_termios_lock_out : &tp->t_termios_lock_in;
+ int cc;
+
+ /*
+		 * Apply the lock-state device: for every flag, control
+		 * character or speed that is locked, keep the currently
+		 * active value and discard the one supplied by the caller.
+ */
+ new->c_iflag = (old->c_iflag & lock->c_iflag) |
+ (new->c_iflag & ~lock->c_iflag);
+ new->c_oflag = (old->c_oflag & lock->c_oflag) |
+ (new->c_oflag & ~lock->c_oflag);
+ new->c_cflag = (old->c_cflag & lock->c_cflag) |
+ (new->c_cflag & ~lock->c_cflag);
+ new->c_lflag = (old->c_lflag & lock->c_lflag) |
+ (new->c_lflag & ~lock->c_lflag);
+ for (cc = 0; cc < NCCS; ++cc)
+ if (lock->c_cc[cc])
+ new->c_cc[cc] = old->c_cc[cc];
+ if (lock->c_ispeed)
+ new->c_ispeed = old->c_ispeed;
+ if (lock->c_ospeed)
+ new->c_ospeed = old->c_ospeed;
+ }
+
+ error = tty_ioctl(tp, cmd, data, fflag, td);
+done: tty_unlock(tp);
+
+ return (error);
+}
+
+static int
+ttydev_poll(struct cdev *dev, int events, struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error, revents = 0;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
+
+ if (events & (POLLIN|POLLRDNORM)) {
+ /* See if we can read something. */
+ if (ttydisc_read_poll(tp) > 0)
+ revents |= events & (POLLIN|POLLRDNORM);
+ }
+
+ if (tp->t_flags & TF_ZOMBIE) {
+ /* Hangup flag on zombie state. */
+ revents |= POLLHUP;
+ } else if (events & (POLLOUT|POLLWRNORM)) {
+ /* See if we can write something. */
+ if (ttydisc_write_poll(tp) > 0)
+ revents |= events & (POLLOUT|POLLWRNORM);
+ }
+
+ if (revents == 0) {
+ if (events & (POLLIN|POLLRDNORM))
+ selrecord(td, &tp->t_inpoll);
+ if (events & (POLLOUT|POLLWRNORM))
+ selrecord(td, &tp->t_outpoll);
+ }
+
+ tty_unlock(tp);
+
+ return (revents);
+}
+
+static int
+ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ /* Handle mmap() through the driver. */
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (-1);
+ error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr);
+ tty_unlock(tp);
+
+ return (error);
+}
+
+/*
+ * kqueue support.
+ */
+
+static void
+tty_kqops_read_detach(struct knote *kn)
+{
+ struct tty *tp = kn->kn_hook;
+
+ knlist_remove(&tp->t_inpoll.si_note, kn, 0);
+}
+
+static int
+tty_kqops_read_event(struct knote *kn, long hint)
+{
+ struct tty *tp = kn->kn_hook;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_read_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static void
+tty_kqops_write_detach(struct knote *kn)
+{
+ struct tty *tp = kn->kn_hook;
+
+ knlist_remove(&tp->t_outpoll.si_note, kn, 0);
+}
+
+static int
+tty_kqops_write_event(struct knote *kn, long hint)
+{
+ struct tty *tp = kn->kn_hook;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp)) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_write_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static struct filterops tty_kqops_read = {
+ .f_isfd = 1,
+ .f_detach = tty_kqops_read_detach,
+ .f_event = tty_kqops_read_event,
+};
+static struct filterops tty_kqops_write = {
+ .f_isfd = 1,
+ .f_detach = tty_kqops_write_detach,
+ .f_event = tty_kqops_write_event,
+};
+
+static int
+ttydev_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_hook = tp;
+ kn->kn_fop = &tty_kqops_read;
+ knlist_add(&tp->t_inpoll.si_note, kn, 1);
+ break;
+ case EVFILT_WRITE:
+ kn->kn_hook = tp;
+ kn->kn_fop = &tty_kqops_write;
+ knlist_add(&tp->t_outpoll.si_note, kn, 1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ tty_unlock(tp);
+ return (error);
+}
+
+static struct cdevsw ttydev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttydev_open,
+ .d_close = ttydev_close,
+ .d_read = ttydev_read,
+ .d_write = ttydev_write,
+ .d_ioctl = ttydev_ioctl,
+ .d_kqfilter = ttydev_kqfilter,
+ .d_poll = ttydev_poll,
+ .d_mmap = ttydev_mmap,
+ .d_name = "ttydev",
+ .d_flags = D_TTY,
+};
+
+/*
+ * Init/lock-state devices
+ */
+
+static int
+ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+ int error = 0;
+
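+	/*
+	 * Wait for tty_makedev() to publish the TTY through si_drv1;
+	 * it wakes us up on &dev->si_drv1 once the pointer is set.
+	 */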
+ while ((tp = dev->si_drv1) == NULL) {
+ error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+ if (error != EWOULDBLOCK)
+ return (error);
+ }
+ tty_lock(tp);
+ if (tty_gone(tp))
+ error = ENODEV;
+ tty_unlock(tp);
+
+ return (error);
+}
+
+static int
+ttyil_close(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+ return (0);
+}
+
+static int
+ttyil_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ return (ENODEV);
+}
+
+static int
+ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ tty_lock(tp);
+ if (tty_gone(tp)) {
+ error = ENODEV;
+ goto done;
+ }
+
+ error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
+ if (error != ENOIOCTL)
+ goto done;
+ error = 0;
+
+ switch (cmd) {
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ *(struct termios*)data = *(struct termios*)dev->si_drv2;
+ break;
+ case TIOCSETA:
+ /* Set terminal flags through tcsetattr(). */
+ error = priv_check(td, PRIV_TTY_SETA);
+ if (error)
+ break;
+ *(struct termios*)dev->si_drv2 = *(struct termios*)data;
+ break;
+ case TIOCGETD:
+ *(int *)data = TTYDISC;
+ break;
+ case TIOCGWINSZ:
+ bzero(data, sizeof(struct winsize));
+ break;
+ default:
+ error = ENOTTY;
+ }
+
+done: tty_unlock(tp);
+ return (error);
+}
+
+static struct cdevsw ttyil_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttyil_open,
+ .d_close = ttyil_close,
+ .d_read = ttyil_rdwr,
+ .d_write = ttyil_rdwr,
+ .d_ioctl = ttyil_ioctl,
+ .d_name = "ttyil",
+ .d_flags = D_TTY,
+};
+
+static void
+tty_init_termios(struct tty *tp)
+{
+ struct termios *t = &tp->t_termios_init_in;
+
+ t->c_cflag = TTYDEF_CFLAG;
+ t->c_iflag = TTYDEF_IFLAG;
+ t->c_lflag = TTYDEF_LFLAG;
+ t->c_oflag = TTYDEF_OFLAG;
+ t->c_ispeed = TTYDEF_SPEED;
+ t->c_ospeed = TTYDEF_SPEED;
+ memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars);
+
+ tp->t_termios_init_out = *t;
+}
+
+void
+tty_init_console(struct tty *tp, speed_t s)
+{
+ struct termios *ti = &tp->t_termios_init_in;
+ struct termios *to = &tp->t_termios_init_out;
+
+ if (s != 0) {
+ ti->c_ispeed = ti->c_ospeed = s;
+ to->c_ispeed = to->c_ospeed = s;
+ }
+
+ ti->c_cflag |= CLOCAL;
+ to->c_cflag |= CLOCAL;
+}
+
+/*
+ * Standard device routine implementations, mostly meant for
+ * pseudo-terminal device drivers. When a driver creates a new terminal
+ * device class, missing routines are patched.
+ */
+
+static int
+ttydevsw_defopen(struct tty *tp)
+{
+
+ return (0);
+}
+
+static void
+ttydevsw_defclose(struct tty *tp)
+{
+}
+
+static void
+ttydevsw_defoutwakeup(struct tty *tp)
+{
+
+ panic("Terminal device has output, while not implemented");
+}
+
+static void
+ttydevsw_definwakeup(struct tty *tp)
+{
+}
+
+static int
+ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
+{
+
+ return (ENOIOCTL);
+}
+
+static int
+ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td)
+{
+
+ return (ENOIOCTL);
+}
+
+static int
+ttydevsw_defparam(struct tty *tp, struct termios *t)
+{
+
+ /*
+ * Allow the baud rate to be adjusted for pseudo-devices, but at
+ * least restrict it to 115200 to prevent excessive buffer
+ * usage. Also disallow 0, to prevent foot shooting.
+ */
+ if (t->c_ispeed < B50)
+ t->c_ispeed = B50;
+ else if (t->c_ispeed > B115200)
+ t->c_ispeed = B115200;
+ if (t->c_ospeed < B50)
+ t->c_ospeed = B50;
+ else if (t->c_ospeed > B115200)
+ t->c_ospeed = B115200;
+ t->c_cflag |= CREAD;
+
+ return (0);
+}
+
+static int
+ttydevsw_defmodem(struct tty *tp, int sigon, int sigoff)
+{
+
+ /* Simulate a carrier to make the TTY layer happy. */
+ return (SER_DCD);
+}
+
+static int
+ttydevsw_defmmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+
+ return (-1);
+}
+
+static void
+ttydevsw_defpktnotify(struct tty *tp, char event)
+{
+}
+
+static void
+ttydevsw_deffree(void *softc)
+{
+
+ panic("Terminal device freed without a free-handler");
+}
+
+/*
+ * TTY allocation and deallocation. A TTY device can be deallocated when
+ * the driver no longer uses it, when the TTY isn't a session's
+ * controlling TTY, and when the device node isn't opened through devfs.
+ */
+
+struct tty *
+tty_alloc(struct ttydevsw *tsw, void *sc)
+{
+
+ return (tty_alloc_mutex(tsw, sc, NULL));
+}
+
+struct tty *
+tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
+{
+ struct tty *tp;
+
+ /* Make sure the driver defines all routines. */
+#define PATCH_FUNC(x) do { \
+ if (tsw->tsw_ ## x == NULL) \
+ tsw->tsw_ ## x = ttydevsw_def ## x; \
+} while (0)
+ PATCH_FUNC(open);
+ PATCH_FUNC(close);
+ PATCH_FUNC(outwakeup);
+ PATCH_FUNC(inwakeup);
+ PATCH_FUNC(ioctl);
+ PATCH_FUNC(cioctl);
+ PATCH_FUNC(param);
+ PATCH_FUNC(modem);
+ PATCH_FUNC(mmap);
+ PATCH_FUNC(pktnotify);
+ PATCH_FUNC(free);
+#undef PATCH_FUNC
+
+ tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO);
+ tp->t_devsw = tsw;
+ tp->t_devswsoftc = sc;
+ tp->t_flags = tsw->tsw_flags;
+
+ tty_init_termios(tp);
+
+ cv_init(&tp->t_inwait, "ttyin");
+ cv_init(&tp->t_outwait, "ttyout");
+ cv_init(&tp->t_outserwait, "ttyosr");
+ cv_init(&tp->t_bgwait, "ttybg");
+ cv_init(&tp->t_dcdwait, "ttydcd");
+
+ /* Allow drivers to use a custom mutex to lock the TTY. */
+ if (mutex != NULL) {
+ tp->t_mtx = mutex;
+ } else {
+ tp->t_mtx = &tp->t_mtxobj;
+ mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF);
+ }
+
+ knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx);
+
+ sx_xlock(&tty_list_sx);
+ TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
+ tty_list_count++;
+ sx_xunlock(&tty_list_sx);
+
+ return (tp);
+}
+
+static void
+tty_dealloc(void *arg)
+{
+ struct tty *tp = arg;
+
+ sx_xlock(&tty_list_sx);
+ TAILQ_REMOVE(&tty_list, tp, t_list);
+ tty_list_count--;
+ sx_xunlock(&tty_list_sx);
+
+ /* Make sure we haven't leaked buffers. */
+ MPASS(ttyinq_getsize(&tp->t_inq) == 0);
+ MPASS(ttyoutq_getsize(&tp->t_outq) == 0);
+
+ seldrain(&tp->t_inpoll);
+ seldrain(&tp->t_outpoll);
+ knlist_destroy(&tp->t_inpoll.si_note);
+ knlist_destroy(&tp->t_outpoll.si_note);
+
+ cv_destroy(&tp->t_inwait);
+ cv_destroy(&tp->t_outwait);
+ cv_destroy(&tp->t_bgwait);
+ cv_destroy(&tp->t_dcdwait);
+ cv_destroy(&tp->t_outserwait);
+
+ if (tp->t_mtx == &tp->t_mtxobj)
+ mtx_destroy(&tp->t_mtxobj);
+ ttydevsw_free(tp);
+ free(tp, M_TTY);
+}
+
+static void
+tty_rel_free(struct tty *tp)
+{
+ struct cdev *dev;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+#define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE)
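+	/*
+	 * The TTY may only be freed once TF_GONE is the sole remaining
+	 * activity flag and no sessions reference it anymore.
+	 */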
+ if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) {
+ /* TTY is still in use. */
+ tty_unlock(tp);
+ return;
+ }
+
+ /* TTY can be deallocated. */
+ dev = tp->t_dev;
+ tp->t_dev = NULL;
+ tty_unlock(tp);
+
+ if (dev != NULL)
+ destroy_dev_sched_cb(dev, tty_dealloc, tp);
+}
+
+void
+tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
+{
+ MPASS(tp->t_sessioncnt > 0);
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_pgrp == pg)
+ tp->t_pgrp = NULL;
+
+ tty_unlock(tp);
+}
+
+void
+tty_rel_sess(struct tty *tp, struct session *sess)
+{
+ MPASS(tp->t_sessioncnt > 0);
+
+ /* Current session has left. */
+ if (tp->t_session == sess) {
+ tp->t_session = NULL;
+ MPASS(tp->t_pgrp == NULL);
+ }
+ tp->t_sessioncnt--;
+ tty_rel_free(tp);
+}
+
+void
+tty_rel_gone(struct tty *tp)
+{
+ MPASS(!tty_gone(tp));
+
+ /* Simulate carrier removal. */
+ ttydisc_modem(tp, 0);
+
+ /* Wake up all blocked threads. */
+ tty_wakeup(tp, FREAD|FWRITE);
+ cv_broadcast(&tp->t_bgwait);
+ cv_broadcast(&tp->t_dcdwait);
+
+ tp->t_flags |= TF_GONE;
+ tty_rel_free(tp);
+}
+
+/*
+ * Exposing information about current TTYs through sysctl
+ */
+
+static void
+tty_to_xtty(struct tty *tp, struct xtty *xt)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ xt->xt_size = sizeof(struct xtty);
+ xt->xt_insize = ttyinq_getsize(&tp->t_inq);
+ xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq);
+ xt->xt_inlc = ttyinq_bytesline(&tp->t_inq);
+ xt->xt_inlow = tp->t_inlow;
+ xt->xt_outsize = ttyoutq_getsize(&tp->t_outq);
+ xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq);
+ xt->xt_outlow = tp->t_outlow;
+ xt->xt_column = tp->t_column;
+ xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+ xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0;
+ xt->xt_flags = tp->t_flags;
+ xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : NODEV;
+}
+
+static int
+sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long lsize;
+ struct xtty *xtlist, *xt;
+ struct tty *tp;
+ int error;
+
+ sx_slock(&tty_list_sx);
+ lsize = tty_list_count * sizeof(struct xtty);
+ if (lsize == 0) {
+ sx_sunlock(&tty_list_sx);
+ return (0);
+ }
+
+ xtlist = xt = malloc(lsize, M_TTY, M_WAITOK);
+
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ tty_lock(tp);
+ tty_to_xtty(tp, xt);
+ tty_unlock(tp);
+ xt++;
+ }
+ sx_sunlock(&tty_list_sx);
+
+ error = SYSCTL_OUT(req, xtlist, lsize);
+ free(xtlist, M_TTY);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs");
+
+/*
+ * Device node creation. The device has been set up; now we can expose it
+ * to the user.
+ */
+
+void
+tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
+{
+ va_list ap;
+ struct cdev *dev;
+ const char *prefix = "tty";
+ char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+
+	/* Remove "tty" prefix from devices like PTYs. */
+ if (tp->t_flags & TF_NOPREFIX)
+ prefix = "";
+
+ va_start(ap, fmt);
+ vsnrprintf(name, sizeof name, 32, fmt, ap);
+ va_end(ap);
+
+ if (cred == NULL) {
+ /* System device. */
+ uid = UID_ROOT;
+ gid = GID_WHEEL;
+ mode = S_IRUSR|S_IWUSR;
+ } else {
+ /* User device. */
+ uid = cred->cr_ruid;
+ gid = GID_TTY;
+ mode = S_IRUSR|S_IWUSR|S_IWGRP;
+ }
+
+ /* Master call-in device. */
+ dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
+ uid, gid, mode, "%s%s", prefix, name);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ tp->t_dev = dev;
+
+ /* Slave call-in devices. */
+ if (tp->t_flags & TF_INITLOCK) {
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred,
+ uid, gid, mode, "%s%s.init", prefix, name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_init_in;
+
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
+ uid, gid, mode, "%s%s.lock", prefix, name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_lock_in;
+ }
+
+ /* Call-out devices. */
+ if (tp->t_flags & TF_CALLOUT) {
+ dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+
+ /* Slave call-out devices. */
+ if (tp->t_flags & TF_INITLOCK) {
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_INIT, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_init_out;
+
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_lock_out;
+ }
+ }
+}
+
+/*
+ * Signalling processes.
+ */
+
+void
+tty_signal_sessleader(struct tty *tp, int sig)
+{
+ struct proc *p;
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(sig >= 1 && sig < NSIG);
+
+ /* Make signals start output again. */
+ tp->t_flags &= ~TF_STOPPED;
+
+ if (tp->t_session != NULL && tp->t_session->s_leader != NULL) {
+ p = tp->t_session->s_leader;
+ PROC_LOCK(p);
+ kern_psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+}
+
+void
+tty_signal_pgrp(struct tty *tp, int sig)
+{
+ ksiginfo_t ksi;
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(sig >= 1 && sig < NSIG);
+
+ /* Make signals start output again. */
+ tp->t_flags &= ~TF_STOPPED;
+
+ if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO))
+ tty_info(tp);
+ if (tp->t_pgrp != NULL) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, sig, 1, &ksi);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+}
+
+void
+tty_wakeup(struct tty *tp, int flags)
+{
+ if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL)
+ pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
+
+ if (flags & FWRITE) {
+ cv_broadcast(&tp->t_outwait);
+ selwakeup(&tp->t_outpoll);
+ KNOTE_LOCKED(&tp->t_outpoll.si_note, 0);
+ }
+ if (flags & FREAD) {
+ cv_broadcast(&tp->t_inwait);
+ selwakeup(&tp->t_inpoll);
+ KNOTE_LOCKED(&tp->t_inpoll.si_note, 0);
+ }
+}
+
+int
+tty_wait(struct tty *tp, struct cv *cv)
+{
+ int error;
+ int revokecnt = tp->t_revokecnt;
+
+ tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
+ MPASS(!tty_gone(tp));
+
+ error = cv_wait_sig(cv, tp->t_mtx);
+
+ /* Restart the system call when we may have been revoked. */
+ if (tp->t_revokecnt != revokecnt)
+ return (ERESTART);
+
+ /* Bail out when the device slipped away. */
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ return (error);
+}
+
+int
+tty_timedwait(struct tty *tp, struct cv *cv, int hz)
+{
+ int error;
+ int revokecnt = tp->t_revokecnt;
+
+ tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
+ MPASS(!tty_gone(tp));
+
+ error = cv_timedwait_sig(cv, tp->t_mtx, hz);
+
+ /* Restart the system call when we may have been revoked. */
+ if (tp->t_revokecnt != revokecnt)
+ return (ERESTART);
+
+ /* Bail out when the device slipped away. */
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ return (error);
+}
+
+void
+tty_flush(struct tty *tp, int flags)
+{
+ if (flags & FWRITE) {
+ tp->t_flags &= ~TF_HIWAT_OUT;
+ ttyoutq_flush(&tp->t_outq);
+ tty_wakeup(tp, FWRITE);
+ ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE);
+ }
+ if (flags & FREAD) {
+ tty_hiwat_in_unblock(tp);
+ ttyinq_flush(&tp->t_inq);
+ ttydevsw_inwakeup(tp);
+ ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD);
+ }
+}
+
+void
+tty_set_winsize(struct tty *tp, const struct winsize *wsz)
+{
+
+ if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0)
+ return;
+ tp->t_winsize = *wsz;
+ tty_signal_pgrp(tp, SIGWINCH);
+}
+
+static int
+tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
+ struct thread *td)
+{
+ int error;
+
+ switch (cmd) {
+ /*
+ * Modem commands.
+ * The SER_* and TIOCM_* flags are the same, but one bit
+ * shifted. I don't know why.
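+	 * For example, SER_DTR corresponds to TIOCM_DTR >> 1, which is
+	 * why the cases below shift the TIOCM_* bits right by one.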
+ */
+ case TIOCSDTR:
+ ttydevsw_modem(tp, SER_DTR, 0);
+ return (0);
+ case TIOCCDTR:
+ ttydevsw_modem(tp, 0, SER_DTR);
+ return (0);
+ case TIOCMSET: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp,
+ (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1,
+ ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1);
+ return (0);
+ }
+ case TIOCMBIS: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0);
+ return (0);
+ }
+ case TIOCMBIC: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1);
+ return (0);
+ }
+ case TIOCMGET:
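+		/*
+		 * Shift the SER_* status bits back into TIOCM_* form;
+		 * TIOCM_LE is always reported.
+		 */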
+ *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1);
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data)
+ tp->t_flags |= TF_ASYNC;
+ else
+ tp->t_flags &= ~TF_ASYNC;
+ return (0);
+ case FIONBIO:
+ /* This device supports non-blocking operation. */
+ return (0);
+ case FIONREAD:
+ *(int *)data = ttyinq_bytescanonicalized(&tp->t_inq);
+ return (0);
+ case FIONWRITE:
+ case TIOCOUTQ:
+ *(int *)data = ttyoutq_bytesused(&tp->t_outq);
+ return (0);
+ case FIOSETOWN:
+ if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
+ /* Not allowed to set ownership. */
+ return (ENOTTY);
+
+ /* Temporarily unlock the TTY to set ownership. */
+ tty_unlock(tp);
+ error = fsetown(*(int *)data, &tp->t_sigio);
+ tty_lock(tp);
+ return (error);
+ case FIOGETOWN:
+ if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
+			/* Not allowed to obtain ownership. */
+ return (ENOTTY);
+
+ /* Get ownership. */
+ *(int *)data = fgetown(&tp->t_sigio);
+ return (0);
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ *(struct termios*)data = tp->t_termios;
+ return (0);
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF: {
+ struct termios *t = data;
+
+ /*
+ * Who makes up these funny rules? According to POSIX,
+ * input baud rate is set equal to the output baud rate
+ * when zero.
+ */
+ if (t->c_ispeed == 0)
+ t->c_ispeed = t->c_ospeed;
+
+ /* Discard any unsupported bits. */
+ t->c_iflag &= TTYSUP_IFLAG;
+ t->c_oflag &= TTYSUP_OFLAG;
+ t->c_lflag &= TTYSUP_LFLAG;
+ t->c_cflag &= TTYSUP_CFLAG;
+
+ /* Set terminal flags through tcsetattr(). */
+ if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ error = tty_drain(tp);
+ if (error)
+ return (error);
+ if (cmd == TIOCSETAF)
+ tty_flush(tp, FREAD);
+ }
+
+ /*
+ * Only call param() when the flags really change.
+ */
+ if ((t->c_cflag & CIGNORE) == 0 &&
+ (tp->t_termios.c_cflag != t->c_cflag ||
+ ((tp->t_termios.c_iflag ^ t->c_iflag) &
+ (IXON|IXOFF|IXANY)) ||
+ tp->t_termios.c_ispeed != t->c_ispeed ||
+ tp->t_termios.c_ospeed != t->c_ospeed)) {
+ error = ttydevsw_param(tp, t);
+ if (error)
+ return (error);
+
+ /* XXX: CLOCAL? */
+
+ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
+ tp->t_termios.c_ispeed = t->c_ispeed;
+ tp->t_termios.c_ospeed = t->c_ospeed;
+
+ /* Baud rate has changed - update watermarks. */
+ tty_watermarks(tp);
+ }
+
+ /* Copy new non-device driver parameters. */
+ tp->t_termios.c_iflag = t->c_iflag;
+ tp->t_termios.c_oflag = t->c_oflag;
+ tp->t_termios.c_lflag = t->c_lflag;
+ memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc);
+
+ ttydisc_optimize(tp);
+
+ if ((t->c_lflag & ICANON) == 0) {
+ /*
+ * When in non-canonical mode, wake up all
+ * readers. Canonicalize any partial input. VMIN
+ * and VTIME could also be adjusted.
+ */
+ ttyinq_canonicalize(&tp->t_inq);
+ tty_wakeup(tp, FREAD);
+ }
+
+ /*
+ * For packet mode: notify the PTY consumer that VSTOP
+ * and VSTART may have been changed.
+ */
+ if (tp->t_termios.c_iflag & IXON &&
+ tp->t_termios.c_cc[VSTOP] == CTRL('S') &&
+ tp->t_termios.c_cc[VSTART] == CTRL('Q'))
+ ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP);
+ else
+ ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP);
+ return (0);
+ }
+ case TIOCGETD:
+ /* For compatibility - we only support TTYDISC. */
+ *(int *)data = TTYDISC;
+ return (0);
+ case TIOCGPGRP:
+ if (!tty_is_ctty(tp, td->td_proc))
+ return (ENOTTY);
+
+ if (tp->t_pgrp != NULL)
+ *(int *)data = tp->t_pgrp->pg_id;
+ else
+ *(int *)data = NO_PID;
+ return (0);
+ case TIOCGSID:
+ if (!tty_is_ctty(tp, td->td_proc))
+ return (ENOTTY);
+
+ MPASS(tp->t_session);
+ *(int *)data = tp->t_session->s_sid;
+ return (0);
+ case TIOCSCTTY: {
+ struct proc *p = td->td_proc;
+
+ /* XXX: This looks awful. */
+ tty_unlock(tp);
+ sx_xlock(&proctree_lock);
+ tty_lock(tp);
+
+ if (!SESS_LEADER(p)) {
+ /* Only the session leader may do this. */
+ sx_xunlock(&proctree_lock);
+ return (EPERM);
+ }
+
+ if (tp->t_session != NULL && tp->t_session == p->p_session) {
+ /* This is already our controlling TTY. */
+ sx_xunlock(&proctree_lock);
+ return (0);
+ }
+
+ if (p->p_session->s_ttyp != NULL ||
+ (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL &&
+ tp->t_session->s_ttyvp->v_type != VBAD)) {
+ /*
+ * There is already a relation between a TTY and
+ * a session, or the caller is not the session
+ * leader.
+ *
+ * Allow the TTY to be stolen when the vnode is
+ * invalid, but the reference to the TTY is
+ * still active. This allows immediate reuse of
+ * TTYs of which the session leader has been
+ * killed or the TTY revoked.
+ */
+ sx_xunlock(&proctree_lock);
+ return (EPERM);
+ }
+
+ /* Connect the session to the TTY. */
+ tp->t_session = p->p_session;
+ tp->t_session->s_ttyp = tp;
+ tp->t_sessioncnt++;
+ sx_xunlock(&proctree_lock);
+
+ /* Assign foreground process group. */
+ tp->t_pgrp = p->p_pgrp;
+ PROC_LOCK(p);
+ p->p_flag |= P_CONTROLT;
+ PROC_UNLOCK(p);
+
+ return (0);
+ }
+ case TIOCSPGRP: {
+ struct pgrp *pg;
+
+ /*
+ * XXX: Temporarily unlock the TTY to locate the process
+		 * group. This code would be a lot nicer if we ever
+		 * decomposed proctree_lock.
+ */
+ tty_unlock(tp);
+ sx_slock(&proctree_lock);
+ pg = pgfind(*(int *)data);
+ if (pg != NULL)
+ PGRP_UNLOCK(pg);
+ if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
+ sx_sunlock(&proctree_lock);
+ tty_lock(tp);
+ return (EPERM);
+ }
+ tty_lock(tp);
+
+ /*
+ * Determine if this TTY is the controlling TTY after
+ * relocking the TTY.
+ */
+ if (!tty_is_ctty(tp, td->td_proc)) {
+ sx_sunlock(&proctree_lock);
+ return (ENOTTY);
+ }
+ tp->t_pgrp = pg;
+ sx_sunlock(&proctree_lock);
+
+ /* Wake up the background process groups. */
+ cv_broadcast(&tp->t_bgwait);
+ return (0);
+ }
+ case TIOCFLUSH: {
+ int flags = *(int *)data;
+
+ if (flags == 0)
+ flags = (FREAD|FWRITE);
+ else
+ flags &= (FREAD|FWRITE);
+ tty_flush(tp, flags);
+ return (0);
+ }
+ case TIOCDRAIN:
+ /* Drain TTY output. */
+ return tty_drain(tp);
+ case TIOCCONS:
+ /* Set terminal as console TTY. */
+ if (*(int *)data) {
+ error = priv_check(td, PRIV_TTY_CONSOLE);
+ if (error)
+ return (error);
+
+ /*
+			 * XXX: constty really should be locked!
+ * XXX: allow disconnected constty's to be stolen!
+ */
+
+ if (constty == tp)
+ return (0);
+ if (constty != NULL)
+ return (EBUSY);
+
+ tty_unlock(tp);
+ constty_set(tp);
+ tty_lock(tp);
+ } else if (constty == tp) {
+ constty_clear();
+ }
+ return (0);
+ case TIOCGWINSZ:
+ /* Obtain window size. */
+ *(struct winsize*)data = tp->t_winsize;
+ return (0);
+ case TIOCSWINSZ:
+ /* Set window size. */
+ tty_set_winsize(tp, data);
+ return (0);
+ case TIOCEXCL:
+ tp->t_flags |= TF_EXCLUDE;
+ return (0);
+ case TIOCNXCL:
+ tp->t_flags &= ~TF_EXCLUDE;
+ return (0);
+ case TIOCSTOP:
+ tp->t_flags |= TF_STOPPED;
+ ttydevsw_pktnotify(tp, TIOCPKT_STOP);
+ return (0);
+ case TIOCSTART:
+ tp->t_flags &= ~TF_STOPPED;
+ ttydevsw_outwakeup(tp);
+ ttydevsw_pktnotify(tp, TIOCPKT_START);
+ return (0);
+ case TIOCSTAT:
+ tty_info(tp);
+ return (0);
+ case TIOCSTI:
+ if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
+ return (EPERM);
+ if (!tty_is_ctty(tp, td->td_proc) &&
+ priv_check(td, PRIV_TTY_STI))
+ return (EACCES);
+ ttydisc_rint(tp, *(char *)data, 0);
+ ttydisc_rint_done(tp);
+ return (0);
+ }
+
+#ifdef COMPAT_43TTY
+ return tty_ioctl_compat(tp, cmd, data, fflag, td);
+#else /* !COMPAT_43TTY */
+ return (ENOIOCTL);
+#endif /* COMPAT_43TTY */
+}
+
+int
+tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
+{
+ int error;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ error = ttydevsw_ioctl(tp, cmd, data, td);
+ if (error == ENOIOCTL)
+ error = tty_generic_ioctl(tp, cmd, data, fflag, td);
+
+ return (error);
+}
+
+dev_t
+tty_udev(struct tty *tp)
+{
+ if (tp->t_dev)
+ return dev2udev(tp->t_dev);
+ else
+ return NODEV;
+}
+
+int
+tty_checkoutq(struct tty *tp)
+{
+
+ /* 256 bytes should be enough to print a log message. */
+ return (ttyoutq_bytesleft(&tp->t_outq) >= 256);
+}
+
+void
+tty_hiwat_in_block(struct tty *tp)
+{
+
+ if ((tp->t_flags & TF_HIWAT_IN) == 0 &&
+ tp->t_termios.c_iflag & IXOFF &&
+ tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) {
+ /*
+ * Input flow control. Only enter the high watermark when we
+ * can successfully store the VSTOP character.
+ */
+ if (ttyoutq_write_nofrag(&tp->t_outq,
+ &tp->t_termios.c_cc[VSTOP], 1) == 0)
+ tp->t_flags |= TF_HIWAT_IN;
+ } else {
+ /* No input flow control. */
+ tp->t_flags |= TF_HIWAT_IN;
+ }
+}
+
+void
+tty_hiwat_in_unblock(struct tty *tp)
+{
+
+ if (tp->t_flags & TF_HIWAT_IN &&
+ tp->t_termios.c_iflag & IXOFF &&
+ tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) {
+ /*
+ * Input flow control. Only leave the high watermark when we
+ * can successfully store the VSTART character.
+ */
+ if (ttyoutq_write_nofrag(&tp->t_outq,
+ &tp->t_termios.c_cc[VSTART], 1) == 0)
+ tp->t_flags &= ~TF_HIWAT_IN;
+ } else {
+ /* No input flow control. */
+ tp->t_flags &= ~TF_HIWAT_IN;
+ }
+
+ if (!tty_gone(tp))
+ ttydevsw_inwakeup(tp);
+}
+
+/*
+ * TTY hooks interface.
+ */
+
+static int
+ttyhook_defrint(struct tty *tp, char c, int flags)
+{
+
+ if (ttyhook_rint_bypass(tp, &c, 1) != 1)
+ return (-1);
+
+ return (0);
+}
+
+int
+ttyhook_register(struct tty **rtp, struct proc *p, int fd,
+ struct ttyhook *th, void *softc)
+{
+ struct tty *tp;
+ struct file *fp;
+ struct cdev *dev;
+ struct cdevsw *cdp;
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, ref;
+
+ /* Validate the file descriptor. */
+ fdp = p->p_fd;
+ error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_TTYHOOK),
+ 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ error = EBADF;
+ goto done1;
+ }
+
+ /*
+ * Make sure the vnode is bound to a character device.
+	 * An unlocked check of the vnode type is ok here, because we
+	 * only need to prevent calling devvn_refthread on a file that
+	 * has never been opened through a character device.
+ */
+ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) {
+ error = EINVAL;
+ goto done1;
+ }
+
+ /* Make sure it is a TTY. */
+ cdp = devvn_refthread(fp->f_vnode, &dev, &ref);
+ if (cdp == NULL) {
+ error = ENXIO;
+ goto done1;
+ }
+ if (dev != fp->f_data) {
+ error = ENXIO;
+ goto done2;
+ }
+ if (cdp != &ttydev_cdevsw) {
+ error = ENOTTY;
+ goto done2;
+ }
+ tp = dev->si_drv1;
+
+ /* Try to attach the hook to the TTY. */
+ error = EBUSY;
+ tty_lock(tp);
+ MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0));
+ if (tp->t_flags & TF_HOOK)
+ goto done3;
+
+ tp->t_flags |= TF_HOOK;
+ tp->t_hook = th;
+ tp->t_hooksoftc = softc;
+ *rtp = tp;
+ error = 0;
+
+ /* Maybe we can switch into bypass mode now. */
+ ttydisc_optimize(tp);
+
+ /* Silently convert rint() calls to rint_bypass() when possible. */
+ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass))
+ th->th_rint = ttyhook_defrint;
+
+done3: tty_unlock(tp);
+done2: dev_relthread(dev, ref);
+done1: fdrop(fp, curthread);
+ return (error);
+}
+
+void
+ttyhook_unregister(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(tp->t_flags & TF_HOOK);
+
+ /* Disconnect the hook. */
+ tp->t_flags &= ~TF_HOOK;
+ tp->t_hook = NULL;
+
+ /* Maybe we need to leave bypass mode. */
+ ttydisc_optimize(tp);
+
+ /* Maybe deallocate the TTY as well. */
+ tty_rel_free(tp);
+}
+
+/*
+ * /dev/console handling.
+ */
+
+static int
+ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+
+ /* System has no console device. */
+ if (dev_console_filename == NULL)
+ return (ENXIO);
+
+ /* Look up corresponding TTY by device name. */
+ sx_slock(&tty_list_sx);
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ if (strcmp(dev_console_filename, tty_devname(tp)) == 0) {
+ dev_console->si_drv1 = tp;
+ break;
+ }
+ }
+ sx_sunlock(&tty_list_sx);
+
+ /* System console has no TTY associated. */
+ if (dev_console->si_drv1 == NULL)
+ return (ENXIO);
+
+ return (ttydev_open(dev, oflags, devtype, td));
+}
+
+static int
+ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+
+ log_console(uio);
+
+ return (ttydev_write(dev, uio, ioflag));
+}
+
+/*
+ * /dev/console is a little different from normal TTYs. When opened,
+ * it determines which TTY to use. When data gets written to it, it
+ * will be logged in the kernel message buffer.
+ */
+static struct cdevsw ttyconsdev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttyconsdev_open,
+ .d_close = ttydev_close,
+ .d_read = ttydev_read,
+ .d_write = ttyconsdev_write,
+ .d_ioctl = ttydev_ioctl,
+ .d_kqfilter = ttydev_kqfilter,
+ .d_poll = ttydev_poll,
+ .d_mmap = ttydev_mmap,
+ .d_name = "ttyconsdev",
+ .d_flags = D_TTY,
+};
+
+static void
+ttyconsdev_init(void *unused)
+{
+
+ dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0,
+ NULL, UID_ROOT, GID_WHEEL, 0600, "console");
+}
+
+SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL);
+
+void
+ttyconsdev_select(const char *name)
+{
+
+ dev_console_filename = name;
+}
+
+/*
+ * Debugging routines.
+ */
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
+static struct {
+ int flag;
+ char val;
+} ttystates[] = {
+#if 0
+ { TF_NOPREFIX, 'N' },
+#endif
+ { TF_INITLOCK, 'I' },
+ { TF_CALLOUT, 'C' },
+
+ /* Keep these together -> 'Oi' and 'Oo'. */
+ { TF_OPENED, 'O' },
+ { TF_OPENED_IN, 'i' },
+ { TF_OPENED_OUT, 'o' },
+ { TF_OPENED_CONS, 'c' },
+
+ { TF_GONE, 'G' },
+ { TF_OPENCLOSE, 'B' },
+ { TF_ASYNC, 'Y' },
+ { TF_LITERAL, 'L' },
+
+ /* Keep these together -> 'Hi' and 'Ho'. */
+ { TF_HIWAT, 'H' },
+ { TF_HIWAT_IN, 'i' },
+ { TF_HIWAT_OUT, 'o' },
+
+ { TF_STOPPED, 'S' },
+ { TF_EXCLUDE, 'X' },
+ { TF_BYPASS, 'l' },
+ { TF_ZOMBIE, 'Z' },
+ { TF_HOOK, 's' },
+
+ /* Keep these together -> 'bi' and 'bo'. */
+ { TF_BUSY, 'b' },
+ { TF_BUSY_IN, 'i' },
+ { TF_BUSY_OUT, 'o' },
+
+ { 0, '\0'},
+};
+
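+/* Flag-name string consumed by the %b conversion in db_show_tty() below. */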
+#define TTY_FLAG_BITS \
+ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN\5OPENED_OUT\6GONE" \
+ "\7OPENCLOSE\10ASYNC\11LITERAL\12HIWAT_IN\13HIWAT_OUT\14STOPPED" \
+ "\15EXCLUDE\16BYPASS\17ZOMBIE\20HOOK"
+
+#define DB_PRINTSYM(name, addr) \
+ db_printf("%s " #name ": ", sep); \
+ db_printsym((db_addr_t) addr, DB_STGY_ANY); \
+ db_printf("\n");
+
+static void
+_db_show_devsw(const char *sep, const struct ttydevsw *tsw)
+{
+ db_printf("%sdevsw: ", sep);
+ db_printsym((db_addr_t)tsw, DB_STGY_ANY);
+ db_printf(" (%p)\n", tsw);
+ DB_PRINTSYM(open, tsw->tsw_open);
+ DB_PRINTSYM(close, tsw->tsw_close);
+ DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup);
+ DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup);
+ DB_PRINTSYM(ioctl, tsw->tsw_ioctl);
+ DB_PRINTSYM(param, tsw->tsw_param);
+ DB_PRINTSYM(modem, tsw->tsw_modem);
+ DB_PRINTSYM(mmap, tsw->tsw_mmap);
+ DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify);
+ DB_PRINTSYM(free, tsw->tsw_free);
+}
+static void
+_db_show_hooks(const char *sep, const struct ttyhook *th)
+{
+ db_printf("%shook: ", sep);
+ db_printsym((db_addr_t)th, DB_STGY_ANY);
+ db_printf(" (%p)\n", th);
+ if (th == NULL)
+ return;
+ DB_PRINTSYM(rint, th->th_rint);
+ DB_PRINTSYM(rint_bypass, th->th_rint_bypass);
+ DB_PRINTSYM(rint_done, th->th_rint_done);
+ DB_PRINTSYM(rint_poll, th->th_rint_poll);
+ DB_PRINTSYM(getc_inject, th->th_getc_inject);
+ DB_PRINTSYM(getc_capture, th->th_getc_capture);
+ DB_PRINTSYM(getc_poll, th->th_getc_poll);
+ DB_PRINTSYM(close, th->th_close);
+}
+
+static void
+_db_show_termios(const char *name, const struct termios *t)
+{
+
+ db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x "
+ "lflag 0x%x ispeed %u ospeed %u\n", name,
+ t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag,
+ t->c_ispeed, t->c_ospeed);
+}
+
+/* DDB command to show TTY statistics. */
+DB_SHOW_COMMAND(tty, db_show_tty)
+{
+ struct tty *tp;
+
+ if (!have_addr) {
+ db_printf("usage: show tty <addr>\n");
+ return;
+ }
+ tp = (struct tty *)addr;
+
+ db_printf("0x%p: %s\n", tp, tty_devname(tp));
+ db_printf("\tmtx: %p\n", tp->t_mtx);
+ db_printf("\tflags: %b\n", tp->t_flags, TTY_FLAG_BITS);
+ db_printf("\trevokecnt: %u\n", tp->t_revokecnt);
+
+ /* Buffering mechanisms. */
+ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u "
+ "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin,
+ tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end,
+ tp->t_inq.ti_nblocks, tp->t_inq.ti_quota);
+ db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n",
+ &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end,
+ tp->t_outq.to_nblocks, tp->t_outq.to_quota);
+ db_printf("\tinlow: %zu\n", tp->t_inlow);
+ db_printf("\toutlow: %zu\n", tp->t_outlow);
+ _db_show_termios("\ttermios", &tp->t_termios);
+ db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n",
+ tp->t_winsize.ws_row, tp->t_winsize.ws_col,
+ tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel);
+ db_printf("\tcolumn: %u\n", tp->t_column);
+ db_printf("\twritepos: %u\n", tp->t_writepos);
+ db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags);
+
+ /* Init/lock-state devices. */
+ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in);
+ _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out);
+ _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in);
+ _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out);
+
+ /* Hooks */
+ _db_show_devsw("\t", tp->t_devsw);
+ _db_show_hooks("\t", tp->t_hook);
+
+ /* Process info. */
+ db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp,
+ tp->t_pgrp ? tp->t_pgrp->pg_id : 0,
+ tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0);
+ db_printf("\tsession: %p", tp->t_session);
+ if (tp->t_session != NULL)
+ db_printf(" count %u leader %p tty %p sid %d login %s",
+ tp->t_session->s_count, tp->t_session->s_leader,
+ tp->t_session->s_ttyp, tp->t_session->s_sid,
+ tp->t_session->s_login);
+ db_printf("\n");
+ db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt);
+ db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc);
+ db_printf("\thooksoftc: %p\n", tp->t_hooksoftc);
+ db_printf("\tdev: %p\n", tp->t_dev);
+}
+
+/* DDB command to list TTYs. */
+DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys)
+{
+ struct tty *tp;
+ size_t isiz, osiz;
+ int i, j;
+
+ /* Make the output look like `pstat -t'. */
+ db_printf("PTR ");
+#if defined(__LP64__)
+ db_printf(" ");
+#endif
+ db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW "
+ "COL SESS PGID STATE\n");
+
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE;
+ osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE;
+
+ db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d %5d ",
+ tp,
+ tty_devname(tp),
+ isiz,
+ tp->t_inq.ti_linestart - tp->t_inq.ti_begin,
+ tp->t_inq.ti_end - tp->t_inq.ti_linestart,
+ isiz - tp->t_inlow,
+ osiz,
+ tp->t_outq.to_end - tp->t_outq.to_begin,
+ osiz - tp->t_outlow,
+ MIN(tp->t_column, 99999),
+ tp->t_session ? tp->t_session->s_sid : 0,
+ tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
+
+ /* Flag bits. */
+ for (i = j = 0; ttystates[i].flag; i++)
+ if (tp->t_flags & ttystates[i].flag) {
+ db_printf("%c", ttystates[i].val);
+ j++;
+ }
+ if (j == 0)
+ db_printf("-");
+ db_printf("\n");
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
new file mode 100644
index 0000000..6dce01d
--- /dev/null
+++ b/sys/kern/tty_compat.c
@@ -0,0 +1,484 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+/*
+ * mapping routines for old line discipline (yuck)
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl_compat.h>
+#include <sys/tty.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+struct speedtab {
+ int sp_speed; /* Speed. */
+ int sp_code; /* Code. */
+};
+
+static int ttcompatgetflags(struct tty *tp);
+static void ttcompatsetflags(struct tty *tp, struct termios *t);
+static void ttcompatsetlflags(struct tty *tp, struct termios *t);
+static int ttcompatspeedtab(int speed, struct speedtab *table);
+
+static int ttydebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, "");
+
+static struct speedtab compatspeeds[] = {
+#define MAX_SPEED 17
+ { 115200, 17 },
+ { 57600, 16 },
+ { 38400, 15 },
+ { 19200, 14 },
+ { 9600, 13 },
+ { 4800, 12 },
+ { 2400, 11 },
+ { 1800, 10 },
+ { 1200, 9 },
+ { 600, 8 },
+ { 300, 7 },
+ { 200, 6 },
+ { 150, 5 },
+ { 134, 4 },
+ { 110, 3 },
+ { 75, 2 },
+ { 50, 1 },
+ { 0, 0 },
+ { -1, -1 },
+};
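+/* compatspcodes[code] maps an sgtty speed code back to a baud rate. */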
+static int compatspcodes[] = {
+ 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
+ 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200,
+};
+
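+/*
+ * Map a termios baud rate to the nearest sgtty speed code, rounding
+ * down; 0 stays 0 (hangup) and rates below 50 map to code 1.
+ */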
+static int
+ttcompatspeedtab(int speed, struct speedtab *table)
+{
+ if (speed == 0)
+ return (0); /* hangup */
+ for ( ; table->sp_speed > 0; table++)
+ if (table->sp_speed <= speed) /* nearest one, rounded down */
+ return (table->sp_code);
+ return (1); /* 50, min and not hangup */
+}
+
+static int
+ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term)
+{
+ switch (*com) {
+ case TIOCSETP:
+ case TIOCSETN: {
+ struct sgttyb *sg = (struct sgttyb *)data;
+ int speed;
+
+ if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_termios.c_ispeed,
+ compatspeeds))
+ term->c_ispeed = compatspcodes[speed];
+ else
+ term->c_ispeed = tp->t_termios.c_ispeed;
+ if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_termios.c_ospeed,
+ compatspeeds))
+ term->c_ospeed = compatspcodes[speed];
+ else
+ term->c_ospeed = tp->t_termios.c_ospeed;
+ term->c_cc[VERASE] = sg->sg_erase;
+ term->c_cc[VKILL] = sg->sg_kill;
+ tp->t_compatflags = (tp->t_compatflags&0xffff0000) |
+ (sg->sg_flags&0xffff);
+ ttcompatsetflags(tp, term);
+ *com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
+ break;
+ }
+ case TIOCSETC: {
+ struct tchars *tc = (struct tchars *)data;
+ cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VINTR] = tc->t_intrc;
+ cc[VQUIT] = tc->t_quitc;
+ cc[VSTART] = tc->t_startc;
+ cc[VSTOP] = tc->t_stopc;
+ cc[VEOF] = tc->t_eofc;
+ cc[VEOL] = tc->t_brkc;
+ if (tc->t_brkc == (char)_POSIX_VDISABLE)
+ cc[VEOL2] = _POSIX_VDISABLE;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCSLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VSUSP] = ltc->t_suspc;
+ cc[VDSUSP] = ltc->t_dsuspc;
+ cc[VREPRINT] = ltc->t_rprntc;
+ cc[VDISCARD] = ltc->t_flushc;
+ cc[VWERASE] = ltc->t_werasc;
+ cc[VLNEXT] = ltc->t_lnextc;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+ if (*com == TIOCLSET)
+ tp->t_compatflags = (tp->t_compatflags&0xffff) |
+ *(int *)data<<16;
+ else {
+ tp->t_compatflags = (ttcompatgetflags(tp)&0xffff0000) |
+ (tp->t_compatflags&0xffff);
+ if (*com == TIOCLBIS)
+ tp->t_compatflags |= *(int *)data<<16;
+ else
+ tp->t_compatflags &= ~(*(int *)data<<16);
+ }
+ ttcompatsetlflags(tp, term);
+ *com = TIOCSETA;
+ break;
+ }
+ return 0;
+}
+
+/*ARGSUSED*/
+int
+tty_ioctl_compat(struct tty *tp, u_long com, caddr_t data, int fflag,
+ struct thread *td)
+{
+ switch (com) {
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET: {
+ struct termios term;
+ int error;
+
+ term = tp->t_termios;
+ if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
+ return error;
+ return tty_ioctl(tp, com, &term, fflag, td);
+ }
+ case TIOCGETP: {
+ struct sgttyb *sg = (struct sgttyb *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ sg->sg_ospeed = ttcompatspeedtab(tp->t_termios.c_ospeed,
+ compatspeeds);
+ if (tp->t_termios.c_ispeed == 0)
+ sg->sg_ispeed = sg->sg_ospeed;
+ else
+ sg->sg_ispeed = ttcompatspeedtab(tp->t_termios.c_ispeed,
+ compatspeeds);
+ sg->sg_erase = cc[VERASE];
+ sg->sg_kill = cc[VKILL];
+ sg->sg_flags = tp->t_compatflags = ttcompatgetflags(tp);
+ break;
+ }
+ case TIOCGETC: {
+ struct tchars *tc = (struct tchars *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ tc->t_intrc = cc[VINTR];
+ tc->t_quitc = cc[VQUIT];
+ tc->t_startc = cc[VSTART];
+ tc->t_stopc = cc[VSTOP];
+ tc->t_eofc = cc[VEOF];
+ tc->t_brkc = cc[VEOL];
+ break;
+ }
+ case TIOCGLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ ltc->t_suspc = cc[VSUSP];
+ ltc->t_dsuspc = cc[VDSUSP];
+ ltc->t_rprntc = cc[VREPRINT];
+ ltc->t_flushc = cc[VDISCARD];
+ ltc->t_werasc = cc[VWERASE];
+ ltc->t_lnextc = cc[VLNEXT];
+ break;
+ }
+ case TIOCLGET:
+ tp->t_compatflags =
+ (ttcompatgetflags(tp) & 0xffff0000UL)
+ | (tp->t_compatflags & 0xffff);
+ *(int *)data = tp->t_compatflags>>16;
+ if (ttydebug)
+ printf("CLGET: returning %x\n", *(int *)data);
+ break;
+
+ case OTIOCGETD:
+ *(int *)data = 2;
+ break;
+
+ case OTIOCSETD: {
+ int ldisczero = 0;
+
+ return (tty_ioctl(tp, TIOCSETD,
+ *(int *)data == 2 ? (caddr_t)&ldisczero : data,
+ fflag, td));
+ }
+
+ case OTIOCCONS:
+ *(int *)data = 1;
+ return (tty_ioctl(tp, TIOCCONS, data, fflag, td));
+
+ default:
+ return (ENOIOCTL);
+ }
+ return (0);
+}
+
+static int
+ttcompatgetflags(struct tty *tp)
+{
+ tcflag_t iflag = tp->t_termios.c_iflag;
+ tcflag_t lflag = tp->t_termios.c_lflag;
+ tcflag_t oflag = tp->t_termios.c_oflag;
+ tcflag_t cflag = tp->t_termios.c_cflag;
+ int flags = 0;
+
+ if (iflag&IXOFF)
+ flags |= TANDEM;
+ if (iflag&ICRNL || oflag&ONLCR)
+ flags |= CRMOD;
+ if ((cflag&CSIZE) == CS8) {
+ flags |= PASS8;
+ if (iflag&ISTRIP)
+ flags |= ANYP;
+ }
+ else if (cflag&PARENB) {
+ if (iflag&INPCK) {
+ if (cflag&PARODD)
+ flags |= ODDP;
+ else
+ flags |= EVENP;
+ } else
+ flags |= EVENP | ODDP;
+ }
+
+ if ((lflag&ICANON) == 0) {
+ /* fudge */
+ if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
+ || (cflag&(CSIZE|PARENB)) != CS8)
+ flags |= CBREAK;
+ else
+ flags |= RAW;
+ }
+ if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8)
+ flags |= LITOUT;
+ if (cflag&MDMBUF)
+ flags |= MDMBUF;
+ if ((cflag&HUPCL) == 0)
+ flags |= NOHANG;
+ if (oflag&TAB3)
+ flags |= XTABS;
+ if (lflag&ECHOE)
+ flags |= CRTERA|CRTBS;
+ if (lflag&ECHOKE)
+ flags |= CRTKIL|CRTBS;
+ if (lflag&ECHOPRT)
+ flags |= PRTERA;
+ if (lflag&ECHOCTL)
+ flags |= CTLECH;
+ if ((iflag&IXANY) == 0)
+ flags |= DECCTQ;
+ flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ if (ttydebug)
+ printf("getflags: %x\n", flags);
+ return (flags);
+}
+
+static void
+ttcompatsetflags(struct tty *tp, struct termios *t)
+{
+ int flags = tp->t_compatflags;
+ tcflag_t iflag = t->c_iflag;
+ tcflag_t oflag = t->c_oflag;
+ tcflag_t lflag = t->c_lflag;
+ tcflag_t cflag = t->c_cflag;
+
+ if (flags & RAW) {
+ iflag = IGNBRK;
+ lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
+ } else {
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ iflag |= BRKINT|IXON|IMAXBEL;
+ lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */
+ if (flags & XTABS)
+ oflag |= TAB3;
+ else
+ oflag &= ~TAB3;
+ if (flags & CBREAK)
+ lflag &= ~ICANON;
+ else
+ lflag |= ICANON;
+ if (flags&CRMOD) {
+ iflag |= ICRNL;
+ oflag |= ONLCR;
+ } else {
+ iflag &= ~ICRNL;
+ oflag &= ~ONLCR;
+ }
+ }
+ if (flags&ECHO)
+ lflag |= ECHO;
+ else
+ lflag &= ~ECHO;
+
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ /* XXX don't set INPCK if RAW or PASS8? */
+ if ((flags&(EVENP|ODDP)) == EVENP) {
+ iflag |= INPCK;
+ cflag &= ~PARODD;
+ } else if ((flags&(EVENP|ODDP)) == ODDP) {
+ iflag |= INPCK;
+ cflag |= PARODD;
+ } else
+ iflag &= ~INPCK;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+
+static void
+ttcompatsetlflags(struct tty *tp, struct termios *t)
+{
+ int flags = tp->t_compatflags;
+ tcflag_t iflag = t->c_iflag;
+ tcflag_t oflag = t->c_oflag;
+ tcflag_t lflag = t->c_lflag;
+ tcflag_t cflag = t->c_cflag;
+
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ if (flags&CRTERA)
+ lflag |= ECHOE;
+ else
+ lflag &= ~ECHOE;
+ if (flags&CRTKIL)
+ lflag |= ECHOKE;
+ else
+ lflag &= ~ECHOKE;
+ if (flags&PRTERA)
+ lflag |= ECHOPRT;
+ else
+ lflag &= ~ECHOPRT;
+ if (flags&CTLECH)
+ lflag |= ECHOCTL;
+ else
+ lflag &= ~ECHOCTL;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ if (flags & MDMBUF)
+ cflag |= MDMBUF;
+ else
+ cflag &= ~MDMBUF;
+ if (flags&NOHANG)
+ cflag &= ~HUPCL;
+ else
+ cflag |= HUPCL;
+ lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+
+ /*
+ * The next if-else statement is copied from above so don't bother
+	 * checking it separately. We could avoid fiddling with the
+	 * character size if the mode is already RAW or if neither the
+	 * LITOUT bit nor the PASS8 bit is being changed, but the delta of
+ * the change is not available here and skipping the RAW case would
+ * make the code different from above.
+ */
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
diff --git a/sys/kern/tty_info.c b/sys/kern/tty_info.c
new file mode 100644
index 0000000..6849d0b
--- /dev/null
+++ b/sys/kern/tty_info.c
@@ -0,0 +1,313 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2002 Networks Associates Technologies, Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed for the FreeBSD Project by
+ * ThinkSec AS and NAI Labs, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035
+ * ("CBOSS"), as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+/*
+ * Returns 1 if p2 is "better" than p1
+ *
+ * The algorithm for picking the "interesting" process is thus:
+ *
+ * 1) Only foreground processes are eligible - implied.
+ * 2) Runnable processes are favored over anything else. The runner
+ * with the highest cpu utilization is picked (p_estcpu). Ties are
+ * broken by picking the highest pid.
+ * 3) The sleeper with the shortest sleep time is next. With ties,
+ * we pick out just "short-term" sleepers (P_SINTR == 0).
+ * 4) Further ties are broken by picking the highest pid.
+ */
+
+#define TESTAB(a, b) ((a)<<1 | (b))
+#define ONLYA 2
+#define ONLYB 1
+#define BOTH 3
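+/*
+ * TESTAB() packs two truth values into two bits so a switch can tell
+ * whether only a, only b, or both hold.
+ */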
+
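+/*
+ * Return whether any thread in p is runnable and sum the pctcpu of
+ * all its threads into *estcpup.
+ */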
+static int
+proc_sum(struct proc *p, fixpt_t *estcpup)
+{
+ struct thread *td;
+ int estcpu;
+ int val;
+
+ val = 0;
+ estcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_ON_RUNQ(td) ||
+ TD_IS_RUNNING(td))
+ val = 1;
+ estcpu += sched_pctcpu(td);
+ thread_unlock(td);
+ }
+ *estcpup = estcpu;
+
+ return (val);
+}
+
+static int
+thread_compare(struct thread *td, struct thread *td2)
+{
+ int runa, runb;
+ int slpa, slpb;
+ fixpt_t esta, estb;
+
+ if (td == NULL)
+ return (1);
+
+ /*
+	 * Fetch running stats, pctcpu usage, and interruptible flag.
+ */
+ thread_lock(td);
+ runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td);
+ slpa = td->td_flags & TDF_SINTR;
+ esta = sched_pctcpu(td);
+ thread_unlock(td);
+ thread_lock(td2);
+ runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2);
+ estb = sched_pctcpu(td2);
+ slpb = td2->td_flags & TDF_SINTR;
+ thread_unlock(td2);
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(runa, runb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+ /*
+ * favor one with highest recent cpu utilization
+ */
+ if (estb > esta)
+ return (1);
+ if (esta > estb)
+ return (0);
+ /*
+ * favor one sleeping in a non-interruptible sleep
+ */
+ switch (TESTAB(slpa, slpb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+
+ return (td < td2);
+}
+
+static int
+proc_compare(struct proc *p1, struct proc *p2)
+{
+
+ int runa, runb;
+ fixpt_t esta, estb;
+
+ if (p1 == NULL)
+ return (1);
+
+ /*
+ * Fetch various stats about these processes. After we drop the
+ * lock the information could be stale but the race is unimportant.
+ */
+ PROC_LOCK(p1);
+ runa = proc_sum(p1, &esta);
+ PROC_UNLOCK(p1);
+ PROC_LOCK(p2);
+ runb = proc_sum(p2, &estb);
+ PROC_UNLOCK(p2);
+
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(runa, runb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+ /*
+ * favor one with highest recent cpu utilization
+ */
+ if (estb > esta)
+ return (1);
+ if (esta > estb)
+ return (0);
+ /*
+ * weed out zombies
+ */
+ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
+ case ONLYA:
+ return (1);
+ case ONLYB:
+ return (0);
+ case BOTH:
+ break;
+ }
+
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+}
+
+/*
+ * Report on state of foreground process group.
+ */
+void
+tty_info(struct tty *tp)
+{
+ struct timeval rtime, utime, stime;
+ struct proc *p, *ppick;
+ struct thread *td, *tdpick;
+ const char *stateprefix, *state;
+ long rss;
+ int load, pctcpu;
+ pid_t pid;
+ char comm[MAXCOMLEN + 1];
+ struct rusage ru;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_checkoutq(tp) == 0)
+ return;
+
+ /* Print load average. */
+ load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
+ ttyprintf(tp, "%sload: %d.%02d ", tp->t_column == 0 ? "" : "\n",
+ load / 100, load % 100);
+
+ if (tp->t_session == NULL) {
+ ttyprintf(tp, "not a controlling terminal\n");
+ return;
+ }
+ if (tp->t_pgrp == NULL) {
+ ttyprintf(tp, "no foreground process group\n");
+ return;
+ }
+ PGRP_LOCK(tp->t_pgrp);
+ if (LIST_EMPTY(&tp->t_pgrp->pg_members)) {
+ PGRP_UNLOCK(tp->t_pgrp);
+ ttyprintf(tp, "empty foreground process group\n");
+ return;
+ }
+
+ /*
+ * Pick the most interesting process and copy some of its
+ * state for printing later. This operation could rely on stale
+ * data as we can't hold the proc slock or thread locks over the
+ * whole list. However, we're guaranteed not to reference an exited
+ * thread or proc since we hold the tty locked.
+ */
+ p = NULL;
+ LIST_FOREACH(ppick, &tp->t_pgrp->pg_members, p_pglist)
+ if (proc_compare(p, ppick))
+ p = ppick;
+
+ PROC_LOCK(p);
+ PGRP_UNLOCK(tp->t_pgrp);
+ td = NULL;
+ FOREACH_THREAD_IN_PROC(p, tdpick)
+ if (thread_compare(td, tdpick))
+ td = tdpick;
+ stateprefix = "";
+ thread_lock(td);
+ if (TD_IS_RUNNING(td))
+ state = "running";
+ else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td))
+ state = "runnable";
+ else if (TD_IS_SLEEPING(td)) {
+ /* XXX: If we're sleeping, are we ever not in a queue? */
+ if (TD_ON_SLEEPQ(td))
+ state = td->td_wmesg;
+ else
+ state = "sleeping without queue";
+ } else if (TD_ON_LOCK(td)) {
+ state = td->td_lockname;
+ stateprefix = "*";
+ } else if (TD_IS_SUSPENDED(td))
+ state = "suspended";
+ else if (TD_AWAITING_INTR(td))
+ state = "intrwait";
+ else if (p->p_state == PRS_ZOMBIE)
+ state = "zombie";
+ else
+ state = "unknown";
+ pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT;
+ thread_unlock(td);
+ if (p->p_state == PRS_NEW || p->p_state == PRS_ZOMBIE)
+ rss = 0;
+ else
+ rss = pgtok(vmspace_resident_count(p->p_vmspace));
+ microuptime(&rtime);
+ timevalsub(&rtime, &p->p_stats->p_start);
+ rufetchcalc(p, &ru, &utime, &stime);
+ pid = p->p_pid;
+ strlcpy(comm, p->p_comm, sizeof comm);
+ PROC_UNLOCK(p);
+
+ /* Print command, pid, state, rtime, utime, stime, %cpu, and rss. */
+ ttyprintf(tp,
+ " cmd: %s %d [%s%s] %ld.%02ldr %ld.%02ldu %ld.%02lds %d%% %ldk\n",
+ comm, pid, stateprefix, state,
+ (long)rtime.tv_sec, rtime.tv_usec / 10000,
+ (long)utime.tv_sec, utime.tv_usec / 10000,
+ (long)stime.tv_sec, stime.tv_usec / 10000,
+ pctcpu / 100, rss);
+}
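+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): the conversions in tty_info() turn FSCALE-scaled fixed-point
+ * values into hundredths by multiplying, adding FSCALE / 2 to round
+ * and shifting right by FSHIFT.  With the usual FSHIFT = 11
+ * (FSCALE = 2048), a load average of 1.50 is stored as 3072 and
+ * yields (3072 * 100 + 1024) >> 11 = 150, printed as "1.50".
+ */
+static __inline int
+fixpt_to_hundredths(fixpt_t f)
+{
+
+ return ((f * 100 + FSCALE / 2) >> FSHIFT);
+}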
diff --git a/sys/kern/tty_inq.c b/sys/kern/tty_inq.c
new file mode 100644
index 0000000..97017ac
--- /dev/null
+++ b/sys/kern/tty_inq.c
@@ -0,0 +1,489 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+
+#include <vm/uma.h>
+
+/*
+ * TTY input queue buffering.
+ *
+ * Unlike the output queue, the input queue has more features that are
+ * needed to properly implement various features offered by the TTY
+ * interface:
+ *
+ * - Data can be removed from the tail of the queue, which is used to
+ * implement backspace.
+ * - Once in a while, input has to be `canonicalized'. When ICANON is
+ * turned on, this will be done after a CR has been inserted.
+ * Otherwise, it should be done after any character has been inserted.
+ * - The input queue can store one bit per byte, called the quoting bit.
+ * This bit is used by TTYDISC to make backspace work on quoted
+ * characters.
+ *
+ * In most cases, there is probably less input than output, so unlike
+ * the outq, we'll stick to 128-byte blocks here.
+ */
+
+static int ttyinq_flush_secure = 1;
+SYSCTL_INT(_kern, OID_AUTO, tty_inq_flush_secure, CTLFLAG_RW,
+ &ttyinq_flush_secure, 0, "Zero buffers while flushing");
+
+#define TTYINQ_QUOTESIZE (TTYINQ_DATASIZE / BMSIZE)
+#define BMSIZE 32
+#define GETBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] & (1 << ((boff) % BMSIZE)))
+#define SETBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] |= (1 << ((boff) % BMSIZE)))
+#define CLRBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] &= ~(1 << ((boff) % BMSIZE)))
+
+struct ttyinq_block {
+ struct ttyinq_block *tib_prev;
+ struct ttyinq_block *tib_next;
+ uint32_t tib_quotes[TTYINQ_QUOTESIZE];
+ char tib_data[TTYINQ_DATASIZE];
+};
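+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): the quoting bitmap stores one bit per byte of tib_data.  A
+ * byte offset maps to word "boff / BMSIZE" and bit "boff % BMSIZE",
+ * so offset 70 lives in tib_quotes[2], bit 6.
+ */
+static __inline int
+ttyinq_offset_quoted(struct ttyinq_block *tib, unsigned int boff)
+{
+
+ return (GETBIT(tib, boff) != 0);
+}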
+
+static uma_zone_t ttyinq_zone;
+
+#define TTYINQ_INSERT_TAIL(ti, tib) do { \
+ if (ti->ti_end == 0) { \
+ tib->tib_prev = NULL; \
+ tib->tib_next = ti->ti_firstblock; \
+ ti->ti_firstblock = tib; \
+ } else { \
+ tib->tib_prev = ti->ti_lastblock; \
+ tib->tib_next = ti->ti_lastblock->tib_next; \
+ ti->ti_lastblock->tib_next = tib; \
+ } \
+ if (tib->tib_next != NULL) \
+ tib->tib_next->tib_prev = tib; \
+ ti->ti_nblocks++; \
+} while (0)
+
+#define TTYINQ_REMOVE_HEAD(ti) do { \
+ ti->ti_firstblock = ti->ti_firstblock->tib_next; \
+ if (ti->ti_firstblock != NULL) \
+ ti->ti_firstblock->tib_prev = NULL; \
+ ti->ti_nblocks--; \
+} while (0)
+
+#define TTYINQ_RECYCLE(ti, tib) do { \
+ if (ti->ti_quota <= ti->ti_nblocks) \
+ uma_zfree(ttyinq_zone, tib); \
+ else \
+ TTYINQ_INSERT_TAIL(ti, tib); \
+} while (0)
+
+void
+ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t size)
+{
+ struct ttyinq_block *tib;
+
+ ti->ti_quota = howmany(size, TTYINQ_DATASIZE);
+
+ while (ti->ti_quota > ti->ti_nblocks) {
+ /*
+ * List is getting bigger.
+ * Add new blocks to the tail of the list.
+ *
+ * We must unlock the TTY temporarily, because we need
+ * to allocate memory. This won't be a problem, because
+ * in the worst case, another thread ends up here, which
+ * may cause us to allocate too many blocks, but this
+ * will be caught by the loop below.
+ */
+ tty_unlock(tp);
+ tib = uma_zalloc(ttyinq_zone, M_WAITOK);
+ tty_lock(tp);
+
+ TTYINQ_INSERT_TAIL(ti, tib);
+ }
+}
+
+void
+ttyinq_free(struct ttyinq *ti)
+{
+ struct ttyinq_block *tib;
+
+ ttyinq_flush(ti);
+ ti->ti_quota = 0;
+
+ while ((tib = ti->ti_firstblock) != NULL) {
+ TTYINQ_REMOVE_HEAD(ti);
+ uma_zfree(ttyinq_zone, tib);
+ }
+
+ MPASS(ti->ti_nblocks == 0);
+}
+
+int
+ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio,
+ size_t rlen, size_t flen)
+{
+
+ MPASS(rlen <= uio->uio_resid);
+
+ while (rlen > 0) {
+ int error;
+ struct ttyinq_block *tib;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (ti->ti_begin == ti->ti_linestart)
+ return (0);
+ tib = ti->ti_firstblock;
+ if (tib == NULL)
+ return (0);
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = ti->ti_begin;
+ cend = MIN(MIN(ti->ti_linestart, ti->ti_begin + rlen),
+ TTYINQ_DATASIZE);
+ clen = cend - cbegin;
+ MPASS(clen >= flen);
+ rlen -= clen;
+
+ /*
+ * We can prevent buffering in some cases:
+ * - We need to read the block until the end.
+ * - We don't need to read the block until the end, but
+ * there is no data beyond it, which allows us to move
+ * the write pointer to a new block.
+ */
+ if (cend == TTYINQ_DATASIZE || cend == ti->ti_end) {
+ /*
+ * Fast path: zero copy. Remove the first block,
+ * so we can unlock the TTY temporarily.
+ */
+ TTYINQ_REMOVE_HEAD(ti);
+ ti->ti_begin = 0;
+
+ /*
+ * Because we remove the first block, we must
+ * fix up the block offsets.
+ */
+#define CORRECT_BLOCK(t) do { \
+ if (t <= TTYINQ_DATASIZE) \
+ t = 0; \
+ else \
+ t -= TTYINQ_DATASIZE; \
+} while (0)
+ CORRECT_BLOCK(ti->ti_linestart);
+ CORRECT_BLOCK(ti->ti_reprint);
+ CORRECT_BLOCK(ti->ti_end);
+#undef CORRECT_BLOCK
+
+ /*
+ * Temporarily unlock and copy the data to
+ * userspace. We may need to flush trailing
+ * bytes, like EOF characters.
+ */
+ tty_unlock(tp);
+ error = uiomove(tib->tib_data + cbegin,
+ clen - flen, uio);
+ tty_lock(tp);
+
+ /* Block can now be re-added to the list. */
+ TTYINQ_RECYCLE(ti, tib);
+ } else {
+ char ob[TTYINQ_DATASIZE - 1];
+
+ /*
+ * Slow path: store data in a temporary buffer.
+ */
+ memcpy(ob, tib->tib_data + cbegin, clen - flen);
+ ti->ti_begin += clen;
+ MPASS(ti->ti_begin < TTYINQ_DATASIZE);
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(ob, clen - flen, uio);
+ tty_lock(tp);
+ }
+
+ if (error != 0)
+ return (error);
+ if (tty_gone(tp))
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static __inline void
+ttyinq_set_quotes(struct ttyinq_block *tib, size_t offset,
+ size_t length, int value)
+{
+
+ if (value) {
+ /* Set the bits. */
+ for (; length > 0; length--, offset++)
+ SETBIT(tib, offset);
+ } else {
+ /* Unset the bits. */
+ for (; length > 0; length--, offset++)
+ CLRBIT(tib, offset);
+ }
+}
+
+size_t
+ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
+{
+ const char *cbuf = buf;
+ struct ttyinq_block *tib;
+ unsigned int boff;
+ size_t l;
+
+ while (nbytes > 0) {
+ boff = ti->ti_end % TTYINQ_DATASIZE;
+
+ if (ti->ti_end == 0) {
+ /* First time we're being used or drained. */
+ MPASS(ti->ti_begin == 0);
+ tib = ti->ti_firstblock;
+ if (tib == NULL) {
+ /* Queue has no blocks. */
+ break;
+ }
+ ti->ti_lastblock = tib;
+ } else if (boff == 0) {
+ /* We reached the end of this block on last write. */
+ tib = ti->ti_lastblock->tib_next;
+ if (tib == NULL) {
+ /* We've reached the watermark. */
+ break;
+ }
+ ti->ti_lastblock = tib;
+ } else {
+ tib = ti->ti_lastblock;
+ }
+
+ /* Don't copy more than was requested. */
+ l = MIN(nbytes, TTYINQ_DATASIZE - boff);
+ MPASS(l > 0);
+ memcpy(tib->tib_data + boff, cbuf, l);
+
+ /* Set the quoting bits for the proper region. */
+ ttyinq_set_quotes(tib, boff, l, quote);
+
+ cbuf += l;
+ nbytes -= l;
+ ti->ti_end += l;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+int
+ttyinq_write_nofrag(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
+{
+ size_t ret;
+
+ if (ttyinq_bytesleft(ti) < nbytes)
+ return (-1);
+
+ /* We should always be able to write it back. */
+ ret = ttyinq_write(ti, buf, nbytes, quote);
+ MPASS(ret == nbytes);
+
+ return (0);
+}
+
+void
+ttyinq_canonicalize(struct ttyinq *ti)
+{
+
+ ti->ti_linestart = ti->ti_reprint = ti->ti_end;
+ ti->ti_startblock = ti->ti_reprintblock = ti->ti_lastblock;
+}
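+
+/*
+ * Illustrative sketch (hypothetical, not part of the original code):
+ * after canonicalization, the bytes between ti_begin and ti_linestart
+ * form completed lines that readers may consume.  The tty headers
+ * provide an accessor for this (ttyinq_bytescanonicalized(), used by
+ * the line discipline); this sketch restates the presumed arithmetic.
+ */
+static __inline size_t
+ttyinq_canonbytes(struct ttyinq *ti)
+{
+
+ return (ti->ti_linestart - ti->ti_begin);
+}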
+
+size_t
+ttyinq_findchar(struct ttyinq *ti, const char *breakc, size_t maxlen,
+ char *lastc)
+{
+ struct ttyinq_block *tib = ti->ti_firstblock;
+ unsigned int boff = ti->ti_begin;
+ unsigned int bend = MIN(MIN(TTYINQ_DATASIZE, ti->ti_linestart),
+ ti->ti_begin + maxlen);
+
+ MPASS(maxlen > 0);
+
+ if (tib == NULL)
+ return (0);
+
+ while (boff < bend) {
+ if (strchr(breakc, tib->tib_data[boff]) && !GETBIT(tib, boff)) {
+ *lastc = tib->tib_data[boff];
+ return (boff - ti->ti_begin + 1);
+ }
+ boff++;
+ }
+
+ /* Not found - just process the entire block. */
+ return (bend - ti->ti_begin);
+}
+
+void
+ttyinq_flush(struct ttyinq *ti)
+{
+ struct ttyinq_block *tib;
+
+ ti->ti_begin = 0;
+ ti->ti_linestart = 0;
+ ti->ti_reprint = 0;
+ ti->ti_end = 0;
+
+ /* Zero all data in the input queue to get rid of passwords. */
+ if (ttyinq_flush_secure) {
+ for (tib = ti->ti_firstblock; tib != NULL; tib = tib->tib_next)
+ bzero(&tib->tib_data, sizeof tib->tib_data);
+ }
+}
+
+int
+ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote)
+{
+ unsigned int boff;
+ struct ttyinq_block *tib = ti->ti_lastblock;
+
+ if (ti->ti_linestart == ti->ti_end)
+ return (-1);
+
+ MPASS(ti->ti_end > 0);
+ boff = (ti->ti_end - 1) % TTYINQ_DATASIZE;
+
+ *c = tib->tib_data[boff];
+ *quote = GETBIT(tib, boff);
+
+ return (0);
+}
+
+void
+ttyinq_unputchar(struct ttyinq *ti)
+{
+
+ MPASS(ti->ti_linestart < ti->ti_end);
+
+ if (--ti->ti_end % TTYINQ_DATASIZE == 0) {
+ /* Roll back to the previous block. */
+ ti->ti_lastblock = ti->ti_lastblock->tib_prev;
+ /*
+ * This can only fail if we are unputchar()'ing the
+ * first character in the queue.
+ */
+ MPASS((ti->ti_lastblock == NULL) == (ti->ti_end == 0));
+ }
+}
+
+void
+ttyinq_reprintpos_set(struct ttyinq *ti)
+{
+
+ ti->ti_reprint = ti->ti_end;
+ ti->ti_reprintblock = ti->ti_lastblock;
+}
+
+void
+ttyinq_reprintpos_reset(struct ttyinq *ti)
+{
+
+ ti->ti_reprint = ti->ti_linestart;
+ ti->ti_reprintblock = ti->ti_startblock;
+}
+
+static void
+ttyinq_line_iterate(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data,
+ unsigned int offset, struct ttyinq_block *tib)
+{
+ unsigned int boff;
+
+ /* Use the proper block when we're at the queue head. */
+ if (offset == 0)
+ tib = ti->ti_firstblock;
+
+ /* Iterate all characters and call the iterator function. */
+ for (; offset < ti->ti_end; offset++) {
+ boff = offset % TTYINQ_DATASIZE;
+ MPASS(tib != NULL);
+
+ /* Call back the iterator function. */
+ iterator(data, tib->tib_data[boff], GETBIT(tib, boff));
+
+ /* Last byte iterated - go to the next block. */
+ if (boff == TTYINQ_DATASIZE - 1)
+ tib = tib->tib_next;
+ MPASS(tib != NULL);
+ }
+}
+
+void
+ttyinq_line_iterate_from_linestart(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data)
+{
+
+ ttyinq_line_iterate(ti, iterator, data,
+ ti->ti_linestart, ti->ti_startblock);
+}
+
+void
+ttyinq_line_iterate_from_reprintpos(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data)
+{
+
+ ttyinq_line_iterate(ti, iterator, data,
+ ti->ti_reprint, ti->ti_reprintblock);
+}
+
+static void
+ttyinq_startup(void *dummy)
+{
+
+ ttyinq_zone = uma_zcreate("ttyinq", sizeof(struct ttyinq_block),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+SYSINIT(ttyinq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyinq_startup, NULL);
diff --git a/sys/kern/tty_outq.c b/sys/kern/tty_outq.c
new file mode 100644
index 0000000..5d40abe
--- /dev/null
+++ b/sys/kern/tty_outq.c
@@ -0,0 +1,339 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+
+#include <vm/uma.h>
+
+/*
+ * TTY output queue buffering.
+ *
+ * The previous design of the TTY layer offered the so-called clists.
+ * These clists were used for both the input queues and the output
+ * queue. We don't use certain features on the output side, like quoting
+ * bits for parity marking and such. This mechanism is similar to the
+ * old clists, but only contains the features we need to buffer the
+ * output.
+ */
+
+struct ttyoutq_block {
+ struct ttyoutq_block *tob_next;
+ char tob_data[TTYOUTQ_DATASIZE];
+};
+
+static uma_zone_t ttyoutq_zone;
+
+#define TTYOUTQ_INSERT_TAIL(to, tob) do { \
+ if (to->to_end == 0) { \
+ tob->tob_next = to->to_firstblock; \
+ to->to_firstblock = tob; \
+ } else { \
+ tob->tob_next = to->to_lastblock->tob_next; \
+ to->to_lastblock->tob_next = tob; \
+ } \
+ to->to_nblocks++; \
+} while (0)
+
+#define TTYOUTQ_REMOVE_HEAD(to) do { \
+ to->to_firstblock = to->to_firstblock->tob_next; \
+ to->to_nblocks--; \
+} while (0)
+
+#define TTYOUTQ_RECYCLE(to, tob) do { \
+ if (to->to_quota <= to->to_nblocks) \
+ uma_zfree(ttyoutq_zone, tob); \
+ else \
+ TTYOUTQ_INSERT_TAIL(to, tob); \
+} while (0)
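+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): to_begin and to_end are byte offsets measured from the start
+ * of the first block, so the number of bytes currently queued is
+ * simply their difference; the tty headers expose an equivalent
+ * accessor for this.
+ */
+static __inline size_t
+ttyoutq_bytesqueued(struct ttyoutq *to)
+{
+
+ return (to->to_end - to->to_begin);
+}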
+
+void
+ttyoutq_flush(struct ttyoutq *to)
+{
+
+ to->to_begin = 0;
+ to->to_end = 0;
+}
+
+void
+ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t size)
+{
+ struct ttyoutq_block *tob;
+
+ to->to_quota = howmany(size, TTYOUTQ_DATASIZE);
+
+ while (to->to_quota > to->to_nblocks) {
+ /*
+ * List is getting bigger.
+ * Add new blocks to the tail of the list.
+ *
+ * We must unlock the TTY temporarily, because we need
+ * to allocate memory. This won't be a problem, because
+ * in the worst case, another thread ends up here, which
+ * may cause us to allocate too many blocks, but this
+ * will be caught by the loop below.
+ */
+ tty_unlock(tp);
+ tob = uma_zalloc(ttyoutq_zone, M_WAITOK);
+ tty_lock(tp);
+
+ TTYOUTQ_INSERT_TAIL(to, tob);
+ }
+}
+
+void
+ttyoutq_free(struct ttyoutq *to)
+{
+ struct ttyoutq_block *tob;
+
+ ttyoutq_flush(to);
+ to->to_quota = 0;
+
+ while ((tob = to->to_firstblock) != NULL) {
+ TTYOUTQ_REMOVE_HEAD(to);
+ uma_zfree(ttyoutq_zone, tob);
+ }
+
+ MPASS(to->to_nblocks == 0);
+}
+
+size_t
+ttyoutq_read(struct ttyoutq *to, void *buf, size_t len)
+{
+ char *cbuf = buf;
+
+ while (len > 0) {
+ struct ttyoutq_block *tob;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (to->to_begin == to->to_end)
+ break;
+ tob = to->to_firstblock;
+ if (tob == NULL)
+ break;
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = to->to_begin;
+ cend = MIN(MIN(to->to_end, to->to_begin + len),
+ TTYOUTQ_DATASIZE);
+ clen = cend - cbegin;
+
+ /* Copy the data out of the buffers. */
+ memcpy(cbuf, tob->tob_data + cbegin, clen);
+ cbuf += clen;
+ len -= clen;
+
+ if (cend == to->to_end) {
+ /* Read the complete queue. */
+ to->to_begin = 0;
+ to->to_end = 0;
+ } else if (cend == TTYOUTQ_DATASIZE) {
+ /* Read the block until the end. */
+ TTYOUTQ_REMOVE_HEAD(to);
+ to->to_begin = 0;
+ to->to_end -= TTYOUTQ_DATASIZE;
+ TTYOUTQ_RECYCLE(to, tob);
+ } else {
+ /* Read the block partially. */
+ to->to_begin += clen;
+ }
+ }
+
+ return (cbuf - (char *)buf);
+}
+
+/*
+ * An optimized version of ttyoutq_read() which can be used in pseudo
+ * TTY drivers to directly copy data from the outq to userspace, instead
+ * of buffering it.
+ *
+ * We can only copy data directly if we need to read the entire block
+ * back to the user, because we temporarily remove the block from the
+ * queue. Otherwise we need to copy it to a temporary buffer first, to
+ * make sure data remains in the correct order.
+ */
+int
+ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio)
+{
+
+ while (uio->uio_resid > 0) {
+ int error;
+ struct ttyoutq_block *tob;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (to->to_begin == to->to_end)
+ return (0);
+ tob = to->to_firstblock;
+ if (tob == NULL)
+ return (0);
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = to->to_begin;
+ cend = MIN(MIN(to->to_end, to->to_begin + uio->uio_resid),
+ TTYOUTQ_DATASIZE);
+ clen = cend - cbegin;
+
+ /*
+ * We can prevent buffering in some cases:
+ * - We need to read the block until the end.
+ * - We don't need to read the block until the end, but
+ * there is no data beyond it, which allows us to move
+ * the write pointer to a new block.
+ */
+ if (cend == TTYOUTQ_DATASIZE || cend == to->to_end) {
+ /*
+ * Fast path: zero copy. Remove the first block,
+ * so we can unlock the TTY temporarily.
+ */
+ TTYOUTQ_REMOVE_HEAD(to);
+ to->to_begin = 0;
+ if (to->to_end <= TTYOUTQ_DATASIZE)
+ to->to_end = 0;
+ else
+ to->to_end -= TTYOUTQ_DATASIZE;
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(tob->tob_data + cbegin, clen, uio);
+ tty_lock(tp);
+
+ /* Block can now be re-added to the list. */
+ TTYOUTQ_RECYCLE(to, tob);
+ } else {
+ char ob[TTYOUTQ_DATASIZE - 1];
+
+ /*
+ * Slow path: store data in a temporary buffer.
+ */
+ memcpy(ob, tob->tob_data + cbegin, clen);
+ to->to_begin += clen;
+ MPASS(to->to_begin < TTYOUTQ_DATASIZE);
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(ob, clen, uio);
+ tty_lock(tp);
+ }
+
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+size_t
+ttyoutq_write(struct ttyoutq *to, const void *buf, size_t nbytes)
+{
+ const char *cbuf = buf;
+ struct ttyoutq_block *tob;
+ unsigned int boff;
+ size_t l;
+
+ while (nbytes > 0) {
+ boff = to->to_end % TTYOUTQ_DATASIZE;
+
+ if (to->to_end == 0) {
+ /* First time we're being used or drained. */
+ MPASS(to->to_begin == 0);
+ tob = to->to_firstblock;
+ if (tob == NULL) {
+ /* Queue has no blocks. */
+ break;
+ }
+ to->to_lastblock = tob;
+ } else if (boff == 0) {
+ /* We reached the end of this block on last write. */
+ tob = to->to_lastblock->tob_next;
+ if (tob == NULL) {
+ /* We've reached the watermark. */
+ break;
+ }
+ to->to_lastblock = tob;
+ } else {
+ tob = to->to_lastblock;
+ }
+
+ /* Don't copy more than was requested. */
+ l = MIN(nbytes, TTYOUTQ_DATASIZE - boff);
+ MPASS(l > 0);
+ memcpy(tob->tob_data + boff, cbuf, l);
+
+ cbuf += l;
+ nbytes -= l;
+ to->to_end += l;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+int
+ttyoutq_write_nofrag(struct ttyoutq *to, const void *buf, size_t nbytes)
+{
+ size_t ret;
+
+ if (ttyoutq_bytesleft(to) < nbytes)
+ return (-1);
+
+ /* We should always be able to write it back. */
+ ret = ttyoutq_write(to, buf, nbytes);
+ MPASS(ret == nbytes);
+
+ return (0);
+}
+
+static void
+ttyoutq_startup(void *dummy)
+{
+
+ ttyoutq_zone = uma_zcreate("ttyoutq", sizeof(struct ttyoutq_block),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+SYSINIT(ttyoutq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyoutq_startup, NULL);
diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c
new file mode 100644
index 0000000..8d2ac03
--- /dev/null
+++ b/sys/kern/tty_pts.c
@@ -0,0 +1,858 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* Add compatibility bits for FreeBSD. */
+#define PTS_COMPAT
+/* Add pty(4) compat bits. */
+#define PTS_EXTERNAL
+/* Add bits to make Linux binaries work. */
+#define PTS_LINUX
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/serial.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Our utmp(5) format is limited to 8-byte TTY line names. This means
+ * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow
+ * users to increase this number, assuming they have manually increased
+ * UT_LINESIZE.
+ */
+static struct unrhdr *pts_pool;
+
+static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
+
+/*
+ * Per-PTS structure.
+ *
+ * List of locks
+ * (t) locked by tty_lock()
+ * (c) const until freeing
+ */
+struct pts_softc {
+ int pts_unit; /* (c) Device unit number. */
+ unsigned int pts_flags; /* (t) Device flags. */
+#define PTS_PKT 0x1 /* Packet mode. */
+#define PTS_FINISHED 0x2 /* Return errors on read()/write(). */
+ char pts_pkt; /* (t) Unread packet mode data. */
+
+ struct cv pts_inwait; /* (t) Blocking write() on master. */
+ struct selinfo pts_inpoll; /* (t) Select queue for write(). */
+ struct cv pts_outwait; /* (t) Blocking read() on master. */
+ struct selinfo pts_outpoll; /* (t) Select queue for read(). */
+
+#ifdef PTS_EXTERNAL
+ struct cdev *pts_cdev; /* (c) Master device node. */
+#endif /* PTS_EXTERNAL */
+
+ struct ucred *pts_cred; /* (c) Resource limit. */
+};
+
+/*
+ * Controller-side file operations.
+ */
+
+static int
+ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0;
+ char pkt;
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ tty_lock(tp);
+
+ for (;;) {
+ /*
+ * Implement packet mode. When packet mode is turned on,
+ * the first byte contains a bitmask of events that
+ * occurred (start, stop, flush, window size, etc.).
+ */
+ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
+ pkt = psc->pts_pkt;
+ psc->pts_pkt = 0;
+ tty_unlock(tp);
+
+ error = ureadc(pkt, uio);
+ return (error);
+ }
+
+ /*
+ * Transmit regular data.
+ *
+ * XXX: We shouldn't use ttydisc_getc_poll()! Even
+ * though in this implementation, there is likely going
+ * to be data, we should just call ttydisc_getc_uio()
+ * and use its return value to sleep.
+ */
+ if (ttydisc_getc_poll(tp)) {
+ if (psc->pts_flags & PTS_PKT) {
+ /*
+ * XXX: Small race. Fortunately PTY
+ * consumers aren't multithreaded.
+ */
+
+ tty_unlock(tp);
+ error = ureadc(TIOCPKT_DATA, uio);
+ if (error)
+ return (error);
+ tty_lock(tp);
+ }
+
+ error = ttydisc_getc_uio(tp, uio);
+ break;
+ }
+
+ /* Maybe the device isn't used anyway. */
+ if (psc->pts_flags & PTS_FINISHED)
+ break;
+
+ /* Wait for more data. */
+ if (fp->f_flag & O_NONBLOCK) {
+ error = EWOULDBLOCK;
+ break;
+ }
+ error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
+ if (error != 0)
+ break;
+ }
+
+ tty_unlock(tp);
+
+ return (error);
+}
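+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): in packet mode the master must drain a pending status byte
+ * before any terminal data.  This restates the readiness test used by
+ * ptsdev_read() and ptsdev_poll().
+ */
+static __inline int
+pts_pkt_pending(struct pts_softc *psc)
+{
+
+ return ((psc->pts_flags & PTS_PKT) != 0 && psc->pts_pkt != 0);
+}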
+
+static int
+ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ char ib[256], *ibstart;
+ size_t iblen, rintlen;
+ int error = 0;
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ for (;;) {
+ ibstart = ib;
+ iblen = MIN(uio->uio_resid, sizeof ib);
+ error = uiomove(ib, iblen, uio);
+
+ tty_lock(tp);
+ if (error != 0) {
+ iblen = 0;
+ goto done;
+ }
+
+ /*
+ * When possible, avoid the slow path. rint_bypass()
+ * copies all input to the input queue at once.
+ */
+ MPASS(iblen > 0);
+ do {
+ rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
+ ibstart += rintlen;
+ iblen -= rintlen;
+ if (iblen == 0) {
+ /* All data written. */
+ break;
+ }
+
+ /* Maybe the device isn't used anyway. */
+ if (psc->pts_flags & PTS_FINISHED) {
+ error = EIO;
+ goto done;
+ }
+
+ /* Wait for more data. */
+ if (fp->f_flag & O_NONBLOCK) {
+ error = EWOULDBLOCK;
+ goto done;
+ }
+
+ /* Wake up users on the slave side. */
+ ttydisc_rint_done(tp);
+ error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
+ if (error != 0)
+ goto done;
+ } while (iblen > 0);
+
+ if (uio->uio_resid == 0)
+ break;
+ tty_unlock(tp);
+ }
+
+done: ttydisc_rint_done(tp);
+ tty_unlock(tp);
+
+ /*
+ * Don't account for the part of the buffer that we couldn't
+ * pass to the TTY.
+ */
+ uio->uio_resid += iblen;
+ return (error);
+}
+
+static int
+ptsdev_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0, sig;
+
+ switch (cmd) {
+ case FIONBIO:
+ /* This device supports non-blocking operation. */
+ return (0);
+ case FIONREAD:
+ tty_lock(tp);
+ if (psc->pts_flags & PTS_FINISHED) {
+ /* Force read() to be called. */
+ *(int *)data = 1;
+ } else {
+ *(int *)data = ttydisc_getc_poll(tp);
+ }
+ tty_unlock(tp);
+ return (0);
+ case FIODGNAME: {
+ struct fiodgname_arg *fgn;
+ const char *p;
+ int i;
+
+ /* Reverse device name lookups, for ptsname() and ttyname(). */
+ fgn = data;
+ p = tty_devname(tp);
+ i = strlen(p) + 1;
+ if (i > fgn->len)
+ return (EINVAL);
+ return copyout(p, fgn->buf, i);
+ }
+
+ /*
+ * We need to implement TIOCGPGRP and TIOCGSID here again. When
+ * called on the pseudo-terminal master, it should not check if
+ * the terminal is the foreground terminal of the calling
+ * process.
+ *
+ * TIOCGETA is also implemented here. Various Linux PTY routines
+ * often call isatty(), which is implemented by tcgetattr().
+ */
+#ifdef PTS_LINUX
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ tty_lock(tp);
+ *(struct termios*)data = tp->t_termios;
+ tty_unlock(tp);
+ return (0);
+#endif /* PTS_LINUX */
+ case TIOCSETAF:
+ case TIOCSETAW:
+ /*
+ * We must make sure we turn tcsetattr() calls of TCSAFLUSH and
+ * TCSADRAIN into something different. If an application were to
+ * call TCSAFLUSH or TCSADRAIN on the master descriptor, it could
+ * deadlock waiting for all data to be read.
+ */
+ cmd = TIOCSETA;
+ break;
+#if defined(PTS_COMPAT) || defined(PTS_LINUX)
+ case TIOCGPTN:
+ /*
+ * Get the device unit number.
+ */
+ if (psc->pts_unit < 0)
+ return (ENOTTY);
+ *(unsigned int *)data = psc->pts_unit;
+ return (0);
+#endif /* PTS_COMPAT || PTS_LINUX */
+ case TIOCGPGRP:
+ /* Get the foreground process group ID. */
+ tty_lock(tp);
+ if (tp->t_pgrp != NULL)
+ *(int *)data = tp->t_pgrp->pg_id;
+ else
+ *(int *)data = NO_PID;
+ tty_unlock(tp);
+ return (0);
+ case TIOCGSID:
+ /* Get the session leader process ID. */
+ tty_lock(tp);
+ if (tp->t_session == NULL)
+ error = ENOTTY;
+ else
+ *(int *)data = tp->t_session->s_sid;
+ tty_unlock(tp);
+ return (error);
+ case TIOCPTMASTER:
+ /* Yes, we are a pseudo-terminal master. */
+ return (0);
+ case TIOCSIG:
+ /* Signal the foreground process group. */
+ sig = *(int *)data;
+ if (sig < 1 || sig >= NSIG)
+ return (EINVAL);
+
+ tty_lock(tp);
+ tty_signal_pgrp(tp, sig);
+ tty_unlock(tp);
+ return (0);
+ case TIOCPKT:
+ /* Enable/disable packet mode. */
+ tty_lock(tp);
+ if (*(int *)data)
+ psc->pts_flags |= PTS_PKT;
+ else
+ psc->pts_flags &= ~PTS_PKT;
+ tty_unlock(tp);
+ return (0);
+ }
+
+ /* Just redirect this ioctl to the slave device. */
+ tty_lock(tp);
+ error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
+ tty_unlock(tp);
+ if (error == ENOIOCTL)
+ error = ENOTTY;
+
+ return (error);
+}
+
+static int
+ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int revents = 0;
+
+ tty_lock(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ /* Slave device is not opened. */
+ tty_unlock(tp);
+ return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
+ }
+
+ if (events & (POLLIN|POLLRDNORM)) {
+ /* See if we can getc something. */
+ if (ttydisc_getc_poll(tp) ||
+ (psc->pts_flags & PTS_PKT && psc->pts_pkt))
+ revents |= events & (POLLIN|POLLRDNORM);
+ }
+ if (events & (POLLOUT|POLLWRNORM)) {
+ /* See if we can rint something. */
+ if (ttydisc_rint_poll(tp))
+ revents |= events & (POLLOUT|POLLWRNORM);
+ }
+
+ /*
+ * No need to check for POLLHUP here. This device cannot be used
+ * as a callout device, which means we always have a carrier,
+ * because the master is.
+ */
+
+ if (revents == 0) {
+ /*
+ * This code might look misleading, but the naming of
+ * poll events on this side is the opposite of the slave
+ * device.
+ */
+ if (events & (POLLIN|POLLRDNORM))
+ selrecord(td, &psc->pts_outpoll);
+ if (events & (POLLOUT|POLLWRNORM))
+ selrecord(td, &psc->pts_inpoll);
+ }
+
+ tty_unlock(tp);
+
+ return (revents);
+}
+
+/*
+ * kqueue support.
+ */
+
+static void
+pts_kqops_read_detach(struct knote *kn)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
+}
+
+static int
+pts_kqops_read_event(struct knote *kn, long hint)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_getc_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static void
+pts_kqops_write_detach(struct knote *kn)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
+}
+
+static int
+pts_kqops_write_event(struct knote *kn, long hint)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_rint_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static struct filterops pts_kqops_read = {
+ .f_isfd = 1,
+ .f_detach = pts_kqops_read_detach,
+ .f_event = pts_kqops_read_event,
+};
+static struct filterops pts_kqops_write = {
+ .f_isfd = 1,
+ .f_detach = pts_kqops_write_detach,
+ .f_event = pts_kqops_write_event,
+};
+
+static int
+ptsdev_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0;
+
+ tty_lock(tp);
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &pts_kqops_read;
+ knlist_add(&psc->pts_outpoll.si_note, kn, 1);
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &pts_kqops_write;
+ knlist_add(&psc->pts_inpoll.si_note, kn, 1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ tty_unlock(tp);
+ return (error);
+}
+
+static int
+ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+#ifdef PTS_EXTERNAL
+ struct pts_softc *psc = tty_softc(tp);
+#endif /* PTS_EXTERNAL */
+ struct cdev *dev = tp->t_dev;
+
+ /*
+ * According to POSIX, we must implement an fstat(). This also
+ * makes this implementation compatible with Linux binaries,
+ * because Linux calls fstat() on the pseudo-terminal master to
+ * obtain st_rdev.
+ *
+ * XXX: POSIX also mentions we must fill in st_dev, but how?
+ */
+
+ bzero(sb, sizeof *sb);
+#ifdef PTS_EXTERNAL
+ if (psc->pts_cdev != NULL)
+ sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
+ else
+#endif /* PTS_EXTERNAL */
+ sb->st_ino = sb->st_rdev = tty_udev(tp);
+
+ sb->st_atim = dev->si_atime;
+ sb->st_ctim = dev->si_ctime;
+ sb->st_mtim = dev->si_mtime;
+ sb->st_uid = dev->si_uid;
+ sb->st_gid = dev->si_gid;
+ sb->st_mode = dev->si_mode | S_IFCHR;
+
+ return (0);
+}
+
+static int
+ptsdev_close(struct file *fp, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+
+ /* Deallocate TTY device. */
+ tty_lock(tp);
+ tty_rel_gone(tp);
+
+ /*
+ * Open of /dev/ptmx or /dev/ptyXX changes the type of file
+ * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode
+ * use count, we need to decrement it, and possibly do other
+ * required cleanup.
+ */
+ if (fp->f_vnode != NULL)
+ return (vnops.fo_close(fp, td));
+
+ return (0);
+}
+
+static struct fileops ptsdev_ops = {
+ .fo_read = ptsdev_read,
+ .fo_write = ptsdev_write,
+ .fo_truncate = ptsdev_truncate,
+ .fo_ioctl = ptsdev_ioctl,
+ .fo_poll = ptsdev_poll,
+ .fo_kqfilter = ptsdev_kqfilter,
+ .fo_stat = ptsdev_stat,
+ .fo_close = ptsdev_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Driver-side hooks.
+ */
+
+static void
+ptsdrv_outwakeup(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ cv_broadcast(&psc->pts_outwait);
+ selwakeup(&psc->pts_outpoll);
+ KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
+}
+
+static void
+ptsdrv_inwakeup(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ cv_broadcast(&psc->pts_inwait);
+ selwakeup(&psc->pts_inpoll);
+ KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
+}
+
+static int
+ptsdrv_open(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ psc->pts_flags &= ~PTS_FINISHED;
+
+ return (0);
+}
+
+static void
+ptsdrv_close(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ /* Wake up any blocked readers/writers. */
+ psc->pts_flags |= PTS_FINISHED;
+ ptsdrv_outwakeup(tp);
+ ptsdrv_inwakeup(tp);
+}
+
+static void
+ptsdrv_pktnotify(struct tty *tp, char event)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ /*
+ * Clear conflicting flags.
+ */
+
+ switch (event) {
+ case TIOCPKT_STOP:
+ psc->pts_pkt &= ~TIOCPKT_START;
+ break;
+ case TIOCPKT_START:
+ psc->pts_pkt &= ~TIOCPKT_STOP;
+ break;
+ case TIOCPKT_NOSTOP:
+ psc->pts_pkt &= ~TIOCPKT_DOSTOP;
+ break;
+ case TIOCPKT_DOSTOP:
+ psc->pts_pkt &= ~TIOCPKT_NOSTOP;
+ break;
+ }
+
+ psc->pts_pkt |= event;
+ ptsdrv_outwakeup(tp);
+}
+
+static void
+ptsdrv_free(void *softc)
+{
+ struct pts_softc *psc = softc;
+
+ /* Make device number available again. */
+ if (psc->pts_unit >= 0)
+ free_unr(pts_pool, psc->pts_unit);
+
+ chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
+ racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
+ crfree(psc->pts_cred);
+
+ seldrain(&psc->pts_inpoll);
+ seldrain(&psc->pts_outpoll);
+ knlist_destroy(&psc->pts_inpoll.si_note);
+ knlist_destroy(&psc->pts_outpoll.si_note);
+
+#ifdef PTS_EXTERNAL
+ /* Destroy master device as well. */
+ if (psc->pts_cdev != NULL)
+ destroy_dev_sched(psc->pts_cdev);
+#endif /* PTS_EXTERNAL */
+
+ free(psc, M_PTS);
+}
+
+static struct ttydevsw pts_class = {
+ .tsw_flags = TF_NOPREFIX,
+ .tsw_outwakeup = ptsdrv_outwakeup,
+ .tsw_inwakeup = ptsdrv_inwakeup,
+ .tsw_open = ptsdrv_open,
+ .tsw_close = ptsdrv_close,
+ .tsw_pktnotify = ptsdrv_pktnotify,
+ .tsw_free = ptsdrv_free,
+};
+
+#ifndef PTS_EXTERNAL
+static
+#endif /* !PTS_EXTERNAL */
+int
+pts_alloc(int fflags, struct thread *td, struct file *fp)
+{
+ int unit, ok, error;
+ struct tty *tp;
+ struct pts_softc *psc;
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+
+ /* Resource limiting. */
+ PROC_LOCK(p);
+ error = racct_add(p, RACCT_NPTS, 1);
+ if (error != 0) {
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
+ if (!ok) {
+ racct_sub(p, RACCT_NPTS, 1);
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ PROC_UNLOCK(p);
+
+ /* Try to allocate a new pts unit number. */
+ unit = alloc_unr(pts_pool);
+ if (unit < 0) {
+ racct_sub(p, RACCT_NPTS, 1);
+ chgptscnt(cred->cr_ruidinfo, -1, 0);
+ return (EAGAIN);
+ }
+
+ /* Allocate TTY and softc. */
+ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
+ cv_init(&psc->pts_inwait, "ptsin");
+ cv_init(&psc->pts_outwait, "ptsout");
+
+ psc->pts_unit = unit;
+ psc->pts_cred = crhold(cred);
+
+ tp = tty_alloc(&pts_class, psc);
+ knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
+
+ /* Expose the slave device as well. */
+ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
+
+ finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
+
+ return (0);
+}
+
+#ifdef PTS_EXTERNAL
+int
+pts_alloc_external(int fflags, struct thread *td, struct file *fp,
+ struct cdev *dev, const char *name)
+{
+ int ok, error;
+ struct tty *tp;
+ struct pts_softc *psc;
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+
+ /* Resource limiting. */
+ PROC_LOCK(p);
+ error = racct_add(p, RACCT_NPTS, 1);
+ if (error != 0) {
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
+ if (!ok) {
+ racct_sub(p, RACCT_NPTS, 1);
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ PROC_UNLOCK(p);
+
+ /* Allocate TTY and softc. */
+ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
+ cv_init(&psc->pts_inwait, "ptsin");
+ cv_init(&psc->pts_outwait, "ptsout");
+
+ psc->pts_unit = -1;
+ psc->pts_cdev = dev;
+ psc->pts_cred = crhold(cred);
+
+ tp = tty_alloc(&pts_class, psc);
+ knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
+
+ /* Expose the slave device as well. */
+ tty_makedev(tp, td->td_ucred, "%s", name);
+
+ finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
+
+ return (0);
+}
+#endif /* PTS_EXTERNAL */
+
+int
+sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
+{
+ int error, fd;
+ struct file *fp;
+
+ /*
+ * POSIX states it's unspecified when other flags are passed. We
+ * don't allow this.
+ */
+ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC))
+ return (EINVAL);
+
+ error = falloc(td, &fp, &fd, uap->flags);
+ if (error)
+ return (error);
+
+ /* Allocate the actual pseudo-TTY. */
+ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
+ if (error != 0) {
+ fdclose(td->td_proc->p_fd, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ /* Pass it back to userspace. */
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+
+ return (0);
+}
+
+static void
+pts_init(void *unused)
+{
+
+ pts_pool = new_unrhdr(0, INT_MAX, NULL);
+}
+
+SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
new file mode 100644
index 0000000..07d8358
--- /dev/null
+++ b/sys/kern/tty_tty.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 2003 Poul-Henning Kamp. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/vnode.h>
+
+#include <fs/devfs/devfs.h>
+#include <fs/devfs/devfs_int.h>
+
+static d_open_t cttyopen;
+
+static struct cdevsw ctty_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = cttyopen,
+ .d_name = "ctty",
+};
+
+static struct cdev *ctty;
+
+static int
+cttyopen(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+
+ return (ENXIO);
+}
+
+static void
+ctty_clone(void *arg, struct ucred *cred, char *name, int namelen,
+ struct cdev **dev)
+{
+
+ if (*dev != NULL)
+ return;
+ if (strcmp(name, "tty"))
+ return;
+ sx_sunlock(&clone_drain_lock);
+ sx_slock(&proctree_lock);
+ sx_slock(&clone_drain_lock);
+ dev_lock();
+ if (!(curthread->td_proc->p_flag & P_CONTROLT))
+ *dev = ctty;
+ else if (curthread->td_proc->p_session->s_ttyvp == NULL)
+ *dev = ctty;
+ else if (curthread->td_proc->p_session->s_ttyvp->v_type == VBAD ||
+ curthread->td_proc->p_session->s_ttyvp->v_rdev == NULL) {
+ /* e.g. s_ttyvp was revoked */
+ *dev = ctty;
+ } else
+ *dev = curthread->td_proc->p_session->s_ttyvp->v_rdev;
+ dev_refl(*dev);
+ dev_unlock();
+ sx_sunlock(&proctree_lock);
+}
+
+static void
+ctty_drvinit(void *unused)
+{
+
+ EVENTHANDLER_REGISTER(dev_clone, ctty_clone, 0, 1000);
+ ctty = make_dev_credf(MAKEDEV_ETERNAL, &ctty_cdevsw, 0, NULL, UID_ROOT,
+ GID_WHEEL, 0666, "ctty");
+}
+
+SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,ctty_drvinit,NULL);
diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
new file mode 100644
index 0000000..63b144a
--- /dev/null
+++ b/sys/kern/tty_ttydisc.c
@@ -0,0 +1,1268 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/fcntl.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/signal.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+#include <sys/ttydefaults.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+/*
+ * Standard TTYDISC `termios' line discipline.
+ */
+
+/* Statistics. */
+static unsigned long tty_nin = 0;
+SYSCTL_ULONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD,
+ &tty_nin, 0, "Total amount of bytes received");
+static unsigned long tty_nout = 0;
+SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
+ &tty_nout, 0, "Total amount of bytes transmitted");
+
+/* termios comparison macros. */
+#define CMP_CC(v,c) (tp->t_termios.c_cc[v] != _POSIX_VDISABLE && \
+ tp->t_termios.c_cc[v] == (c))
+#define CMP_FLAG(field,opt) (tp->t_termios.c_ ## field ## flag & (opt))
+
+/* Characters that cannot be modified through c_cc. */
+#define CTAB '\t'
+#define CNL '\n'
+#define CCR '\r'
+
+/* Character is a control character. */
+#define CTL_VALID(c) ((c) == 0x7f || (unsigned char)(c) < 0x20)
+/* Control character should be processed on echo. */
+#define CTL_ECHO(c,q) (!(q) && ((c) == CERASE2 || (c) == CTAB || \
+ (c) == CNL || (c) == CCR))
+/* Control character should be printed using ^X notation. */
+#define CTL_PRINT(c,q) ((c) == 0x7f || ((unsigned char)(c) < 0x20 && \
+ ((q) || ((c) != CTAB && (c) != CNL))))
+/* Character is whitespace. */
+#define CTL_WHITE(c) ((c) == ' ' || (c) == CTAB)
+/* Character is alphanumeric. */
+#define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \
+ ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+#define TTY_STACKBUF 256
+
+void
+ttydisc_open(struct tty *tp)
+{
+ ttydisc_optimize(tp);
+}
+
+void
+ttydisc_close(struct tty *tp)
+{
+
+ /* Clean up our flags when leaving the discipline. */
+ tp->t_flags &= ~(TF_STOPPED|TF_HIWAT|TF_ZOMBIE);
+
+ /* POSIX states we should flush when close() is called. */
+ ttyinq_flush(&tp->t_inq);
+ ttyoutq_flush(&tp->t_outq);
+
+ if (!tty_gone(tp)) {
+ ttydevsw_inwakeup(tp);
+ ttydevsw_outwakeup(tp);
+ }
+
+ if (ttyhook_hashook(tp, close))
+ ttyhook_close(tp);
+}
+
+static int
+ttydisc_read_canonical(struct tty *tp, struct uio *uio, int ioflag)
+{
+ char breakc[4] = { CNL }; /* enough to hold \n, VEOF and VEOL. */
+ int error;
+ size_t clen, flen = 0, n = 1;
+ unsigned char lastc = _POSIX_VDISABLE;
+
+#define BREAK_ADD(c) do { \
+ if (tp->t_termios.c_cc[c] != _POSIX_VDISABLE) \
+ breakc[n++] = tp->t_termios.c_cc[c]; \
+} while (0)
+ /* Determine which characters we should trigger on. */
+ BREAK_ADD(VEOF);
+ BREAK_ADD(VEOL);
+#undef BREAK_ADD
+ breakc[n] = '\0';
+
+ do {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ /*
+ * Quite a tricky case: unlike the old TTY
+ * implementation, this implementation copies data back
+ * to userspace in large chunks. Unfortunately, we can't
+ * calculate the line length on beforehand if it crosses
+ * ttyinq_block boundaries, because multiple reads could
+ * then make this code read beyond the newline.
+ *
+ * This is why we limit the read to:
+ * - The size the user has requested
+ * - The blocksize (done in tty_inq.c)
+ * - The amount of bytes until the newline
+ *
+ * This causes the line length to be recalculated after
+ * each block has been copied to userspace. This will
+ * cause the TTY layer to return data in chunks using
+ * the blocksize (except the first and last blocks).
+ */
+ clen = ttyinq_findchar(&tp->t_inq, breakc, uio->uio_resid,
+ &lastc);
+
+ /* No more data. */
+ if (clen == 0) {
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ continue;
+ }
+
+ /* Don't send the EOF char back to userspace. */
+ if (CMP_CC(VEOF, lastc))
+ flen = 1;
+
+ MPASS(flen <= clen);
+
+ /* Read and throw away the EOF character. */
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio, clen, flen);
+ if (error)
+ return (error);
+
+ } while (uio->uio_resid > 0 && lastc == _POSIX_VDISABLE);
+
+ return (0);
+}
+
+static int
+ttydisc_read_raw_no_timer(struct tty *tp, struct uio *uio, int ioflag)
+{
+ size_t vmin = tp->t_termios.c_cc[VMIN];
+ ssize_t oresid = uio->uio_resid;
+ int error;
+
+ MPASS(tp->t_termios.c_cc[VTIME] == 0);
+
+ /*
+ * This routine implements the easy cases of read()s while in
+ * non-canonical mode, namely case B and D, where we don't have
+ * any timers at all.
+ */
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /* We have to wait for more. */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ }
+}
+
+static int
+ttydisc_read_raw_read_timer(struct tty *tp, struct uio *uio, int ioflag,
+ int oresid)
+{
+ size_t vmin = MAX(tp->t_termios.c_cc[VMIN], 1);
+ unsigned int vtime = tp->t_termios.c_cc[VTIME];
+ struct timeval end, now, left;
+ int error, hz;
+
+ MPASS(tp->t_termios.c_cc[VTIME] != 0);
+
+ /* Determine when the read should be expired. */
+ end.tv_sec = vtime / 10;
+ end.tv_usec = (vtime % 10) * 100000;
+ getmicrotime(&now);
+ timevaladd(&end, &now);
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /* Calculate how long we should wait. */
+ getmicrotime(&now);
+ if (timevalcmp(&now, &end, >))
+ return (0);
+ left = end;
+ timevalsub(&left, &now);
+ hz = tvtohz(&left);
+
+ /*
+ * We have to wait for more. If the timer expires, we
+ * should return a 0-byte read.
+ */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_timedwait(tp, &tp->t_inwait, hz);
+ if (error)
+ return (error == EWOULDBLOCK ? 0 : error);
+ }
+
+ return (0);
+}
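+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): VTIME is expressed in tenths of a second, so the deadline
+ * computed at the top of ttydisc_read_raw_read_timer() splits it into
+ * whole seconds and microseconds.  VTIME = 25 means 2.5 seconds, i.e.
+ * tv_sec = 2 and tv_usec = 500000.
+ */
+static __inline void
+ttydisc_vtime_to_timeval(unsigned int vtime, struct timeval *tv)
+{
+
+ tv->tv_sec = vtime / 10;
+ tv->tv_usec = (vtime % 10) * 100000;
+}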
+
+static int
+ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag)
+{
+ size_t vmin = tp->t_termios.c_cc[VMIN];
+ ssize_t oresid = uio->uio_resid;
+ int error;
+
+ MPASS(tp->t_termios.c_cc[VMIN] != 0);
+ MPASS(tp->t_termios.c_cc[VTIME] != 0);
+
+ /*
+ * When using the interbyte timer, the timer should be started
+ * after the first byte has been received. We just call into the
+ * generic read timer code after we've received the first byte.
+ */
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /*
+ * Not enough data, but we did receive some, which means
+ * we'll now start using the interbyte timer.
+ */
+ if (oresid != uio->uio_resid)
+ break;
+
+ /* We have to wait for more. */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ }
+
+ return ttydisc_read_raw_read_timer(tp, uio, ioflag, oresid);
+}
+
+int
+ttydisc_read(struct tty *tp, struct uio *uio, int ioflag)
+{
+ int error;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ if (CMP_FLAG(l, ICANON))
+ error = ttydisc_read_canonical(tp, uio, ioflag);
+ else if (tp->t_termios.c_cc[VTIME] == 0)
+ error = ttydisc_read_raw_no_timer(tp, uio, ioflag);
+ else if (tp->t_termios.c_cc[VMIN] == 0)
+ error = ttydisc_read_raw_read_timer(tp, uio, ioflag,
+ uio->uio_resid);
+ else
+ error = ttydisc_read_raw_interbyte_timer(tp, uio, ioflag);
+
+ if (ttyinq_bytesleft(&tp->t_inq) >= tp->t_inlow ||
+ ttyinq_bytescanonicalized(&tp->t_inq) == 0) {
+ /* Unset the input watermark when we've got enough space. */
+ tty_hiwat_in_unblock(tp);
+ }
+
+ return (error);
+}
+
+static __inline unsigned int
+ttydisc_findchar(const char *obstart, unsigned int oblen)
+{
+ const char *c = obstart;
+
+ while (oblen--) {
+ if (CTL_VALID(*c))
+ break;
+ c++;
+ }
+
+ return (c - obstart);
+}
+
+static int
+ttydisc_write_oproc(struct tty *tp, char c)
+{
+ unsigned int scnt, error;
+
+ MPASS(CMP_FLAG(o, OPOST));
+ MPASS(CTL_VALID(c));
+
+#define PRINT_NORMAL() ttyoutq_write_nofrag(&tp->t_outq, &c, 1)
+ switch (c) {
+ case CEOF:
+ /* End-of-text dropping. */
+ if (CMP_FLAG(o, ONOEOT))
+ return (0);
+ return PRINT_NORMAL();
+
+ case CERASE2:
+ /* Handle backspace to fix tab expansion. */
+ if (PRINT_NORMAL() != 0)
+ return (-1);
+ if (tp->t_column > 0)
+ tp->t_column--;
+ return (0);
+
+ case CTAB:
+ /* Tab expansion. */
+ scnt = 8 - (tp->t_column & 7);
+ if (CMP_FLAG(o, TAB3)) {
+ error = ttyoutq_write_nofrag(&tp->t_outq,
+ " ", scnt);
+ } else {
+ error = PRINT_NORMAL();
+ }
+ if (error)
+ return (-1);
+
+ tp->t_column += scnt;
+ MPASS((tp->t_column % 8) == 0);
+ return (0);
+
+ case CNL:
+ /* Newline conversion. */
+ if (CMP_FLAG(o, ONLCR)) {
+ /* Convert \n to \r\n. */
+ error = ttyoutq_write_nofrag(&tp->t_outq, "\r\n", 2);
+ } else {
+ error = PRINT_NORMAL();
+ }
+ if (error)
+ return (-1);
+
+ if (CMP_FLAG(o, ONLCR|ONLRET)) {
+ tp->t_column = tp->t_writepos = 0;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ }
+ return (0);
+
+ case CCR:
+ /* Carriage return to newline conversion. */
+ if (CMP_FLAG(o, OCRNL))
+ c = CNL;
+ /* Omit carriage returns on column 0. */
+ if (CMP_FLAG(o, ONOCR) && tp->t_column == 0)
+ return (0);
+ if (PRINT_NORMAL() != 0)
+ return (-1);
+
+ tp->t_column = tp->t_writepos = 0;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ return (0);
+ }
+
+ /*
+ * Invisible control character. Print it, but don't
+ * increase the column count.
+ */
+ return PRINT_NORMAL();
+#undef PRINT_NORMAL
+}
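To make the column bookkeeping above concrete, a tiny stand-alone illustration (not part of this change) of the tab-stop arithmetic used in the CTAB case: a tab at column col expands to 8 - (col & 7) positions, which is also why a single input byte can require up to eight bytes of output under TAB3. The function name is an illustrative assumption.

/* Illustration only: distance from column "col" to the next 8-column tab stop. */
static unsigned int
tab_width(unsigned int col)
{

	return (8 - (col & 7));		/* always in the range 1..8 */
}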
+
+/*
+ * Just like the old TTY implementation, we need to copy data in chunks
+ * into a temporary buffer. One of the reasons we need to do this is
+ * that output processing (only TAB3, though) may expand the output to
+ * eight times the input size.
+ */
+int
+ttydisc_write(struct tty *tp, struct uio *uio, int ioflag)
+{
+ char ob[TTY_STACKBUF];
+ char *obstart;
+ int error = 0;
+ unsigned int oblen = 0;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_ZOMBIE)
+ return (EIO);
+
+ /*
+	 * We don't need to check whether the process is in the foreground
+	 * process group or whether we have a carrier. This is already done
+ * in ttydev_write().
+ */
+
+ while (uio->uio_resid > 0) {
+ unsigned int nlen;
+
+ MPASS(oblen == 0);
+
+ /* Step 1: read data. */
+ obstart = ob;
+ nlen = MIN(uio->uio_resid, sizeof ob);
+ tty_unlock(tp);
+ error = uiomove(ob, nlen, uio);
+ tty_lock(tp);
+ if (error != 0)
+ break;
+ oblen = nlen;
+
+ if (tty_gone(tp)) {
+ error = ENXIO;
+ break;
+ }
+
+ MPASS(oblen > 0);
+
+ /* Step 2: process data. */
+ do {
+ unsigned int plen, wlen;
+
+ /* Search for special characters for post processing. */
+ if (CMP_FLAG(o, OPOST)) {
+ plen = ttydisc_findchar(obstart, oblen);
+ } else {
+ plen = oblen;
+ }
+
+ if (plen == 0) {
+ /*
+				 * We're about to process a character
+				 * that needs output processing.
+ */
+ if (ttydisc_write_oproc(tp, *obstart) == 0) {
+ obstart++;
+ oblen--;
+
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ continue;
+ }
+ } else {
+ /* We're going to write regular data. */
+ wlen = ttyoutq_write(&tp->t_outq, obstart, plen);
+ obstart += wlen;
+ oblen -= wlen;
+ tp->t_column += wlen;
+
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+
+ if (wlen == plen)
+ continue;
+ }
+
+ /* Watermark reached. Try to sleep. */
+ tp->t_flags |= TF_HIWAT_OUT;
+
+ if (ioflag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto done;
+ }
+
+ /*
+ * The driver may write back the data
+ * synchronously. Be sure to check the high
+ * water mark before going to sleep.
+ */
+ ttydevsw_outwakeup(tp);
+ if ((tp->t_flags & TF_HIWAT_OUT) == 0)
+ continue;
+
+ error = tty_wait(tp, &tp->t_outwait);
+ if (error)
+ goto done;
+
+ if (tp->t_flags & TF_ZOMBIE) {
+ error = EIO;
+ goto done;
+ }
+ } while (oblen > 0);
+ }
+
+done:
+ if (!tty_gone(tp))
+ ttydevsw_outwakeup(tp);
+
+ /*
+	 * Add the number of bytes that we didn't process back to the
+ * uio counters. We need to do this to make sure write() doesn't
+ * count the bytes we didn't store in the queue.
+ */
+ uio->uio_resid += oblen;
+ return (error);
+}
+
+void
+ttydisc_optimize(struct tty *tp)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_bypass)) {
+ tp->t_flags |= TF_BYPASS;
+ } else if (ttyhook_hashook(tp, rint)) {
+ tp->t_flags &= ~TF_BYPASS;
+ } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) &&
+ (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) &&
+ (!CMP_FLAG(i, PARMRK) ||
+ CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) &&
+ !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) {
+ tp->t_flags |= TF_BYPASS;
+ } else {
+ tp->t_flags &= ~TF_BYPASS;
+ }
+}
+
+void
+ttydisc_modem(struct tty *tp, int open)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (open)
+ cv_broadcast(&tp->t_dcdwait);
+
+ /*
+ * Ignore modem status lines when CLOCAL is turned on, but don't
+ * enter the zombie state when the TTY isn't opened, because
+ * that would cause the TTY to be in zombie state after being
+ * opened.
+ */
+ if (!tty_opened(tp) || CMP_FLAG(c, CLOCAL))
+ return;
+
+ if (open == 0) {
+ /*
+ * Lost carrier.
+ */
+ tp->t_flags |= TF_ZOMBIE;
+
+ tty_signal_sessleader(tp, SIGHUP);
+ tty_flush(tp, FREAD|FWRITE);
+ } else {
+ /*
+ * Carrier is back again.
+ */
+
+ /* XXX: what should we do here? */
+ }
+}
+
+static int
+ttydisc_echo_force(struct tty *tp, char c, int quote)
+{
+
+ if (CMP_FLAG(o, OPOST) && CTL_ECHO(c, quote)) {
+ /*
+ * Only perform postprocessing when OPOST is turned on
+ * and the character is an unquoted BS/TB/NL/CR.
+ */
+ return ttydisc_write_oproc(tp, c);
+ } else if (CMP_FLAG(l, ECHOCTL) && CTL_PRINT(c, quote)) {
+ /*
+ * Only use ^X notation when ECHOCTL is turned on and
+		 * we've got a quoted control character.
+ *
+ * Print backspaces when echoing an end-of-file.
+ */
+ char ob[4] = "^?\b\b";
+
+ /* Print ^X notation. */
+ if (c != 0x7f)
+ ob[1] = c + 'A' - 1;
+
+ if (!quote && CMP_CC(VEOF, c)) {
+ return ttyoutq_write_nofrag(&tp->t_outq, ob, 4);
+ } else {
+ tp->t_column += 2;
+ return ttyoutq_write_nofrag(&tp->t_outq, ob, 2);
+ }
+ } else {
+ /* Can just be printed. */
+ tp->t_column++;
+ return ttyoutq_write_nofrag(&tp->t_outq, &c, 1);
+ }
+}
+
+static int
+ttydisc_echo(struct tty *tp, char c, int quote)
+{
+
+ /*
+ * Only echo characters when ECHO is turned on, or ECHONL when
+ * the character is an unquoted newline.
+ */
+ if (!CMP_FLAG(l, ECHO) &&
+ (!CMP_FLAG(l, ECHONL) || c != CNL || quote))
+ return (0);
+
+ return ttydisc_echo_force(tp, c, quote);
+}
+
+static void
+ttydisc_reprint_char(void *d, char c, int quote)
+{
+ struct tty *tp = d;
+
+ ttydisc_echo(tp, c, quote);
+}
+
+static void
+ttydisc_reprint(struct tty *tp)
+{
+ cc_t c;
+
+ /* Print ^R\n, followed by the line. */
+ c = tp->t_termios.c_cc[VREPRINT];
+ if (c != _POSIX_VDISABLE)
+ ttydisc_echo(tp, c, 0);
+ ttydisc_echo(tp, CNL, 0);
+ ttyinq_reprintpos_reset(&tp->t_inq);
+
+ ttyinq_line_iterate_from_linestart(&tp->t_inq, ttydisc_reprint_char, tp);
+}
+
+struct ttydisc_recalc_length {
+ struct tty *tp;
+ unsigned int curlen;
+};
+
+static void
+ttydisc_recalc_charlength(void *d, char c, int quote)
+{
+ struct ttydisc_recalc_length *data = d;
+ struct tty *tp = data->tp;
+
+ if (CTL_PRINT(c, quote)) {
+ if (CMP_FLAG(l, ECHOCTL))
+ data->curlen += 2;
+ } else if (c == CTAB) {
+ data->curlen += 8 - (data->curlen & 7);
+ } else {
+ data->curlen++;
+ }
+}
+
+static unsigned int
+ttydisc_recalc_linelength(struct tty *tp)
+{
+ struct ttydisc_recalc_length data = { tp, tp->t_writepos };
+
+ ttyinq_line_iterate_from_reprintpos(&tp->t_inq,
+ ttydisc_recalc_charlength, &data);
+ return (data.curlen);
+}
+
+static int
+ttydisc_rubchar(struct tty *tp)
+{
+ char c;
+ int quote;
+ unsigned int prevpos, tablen;
+
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return (-1);
+ ttyinq_unputchar(&tp->t_inq);
+
+ if (CMP_FLAG(l, ECHO)) {
+ /*
+ * Remove the character from the screen. This is even
+		 * safe for characters that span multiple columns
+		 * (tabs, quoted characters, etc.).
+ */
+ if (tp->t_writepos >= tp->t_column) {
+ /* Retype the sentence. */
+ ttydisc_reprint(tp);
+ } else if (CMP_FLAG(l, ECHOE)) {
+ if (CTL_PRINT(c, quote)) {
+ /* Remove ^X formatted chars. */
+ if (CMP_FLAG(l, ECHOCTL)) {
+ tp->t_column -= 2;
+ ttyoutq_write_nofrag(&tp->t_outq,
+ "\b\b \b\b", 6);
+ }
+ } else if (c == ' ') {
+ /* Space character needs no rubbing. */
+ tp->t_column -= 1;
+ ttyoutq_write_nofrag(&tp->t_outq, "\b", 1);
+ } else if (c == CTAB) {
+ /*
+ * Making backspace work with tabs is
+ * quite hard. Recalculate the length of
+ * this character and remove it.
+ *
+ * Because terminal settings could be
+ * changed while the line is being
+				 * inserted, the calculations may not be
+				 * correct. Make sure we keep the tab
+				 * length within proper bounds.
+ */
+ prevpos = ttydisc_recalc_linelength(tp);
+ if (prevpos >= tp->t_column)
+ tablen = 1;
+ else
+ tablen = tp->t_column - prevpos;
+ if (tablen > 8)
+ tablen = 8;
+
+ tp->t_column = prevpos;
+ ttyoutq_write_nofrag(&tp->t_outq,
+ "\b\b\b\b\b\b\b\b", tablen);
+ return (0);
+ } else {
+ /*
+ * Remove a regular character by
+ * punching a space over it.
+ */
+ tp->t_column -= 1;
+ ttyoutq_write_nofrag(&tp->t_outq, "\b \b", 3);
+ }
+ } else {
+ /* Don't print spaces. */
+ ttydisc_echo(tp, tp->t_termios.c_cc[VERASE], 0);
+ }
+ }
+
+ return (0);
+}
+
+static void
+ttydisc_rubword(struct tty *tp)
+{
+ char c;
+ int quote, alnum;
+
+ /* Strip whitespace first. */
+ for (;;) {
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return;
+ if (!CTL_WHITE(c))
+ break;
+ ttydisc_rubchar(tp);
+ }
+
+ /*
+ * Record whether the last character from the previous iteration
+ * was alphanumeric or not. We need this to implement ALTWERASE.
+ */
+ alnum = CTL_ALNUM(c);
+ for (;;) {
+ ttydisc_rubchar(tp);
+
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return;
+ if (CTL_WHITE(c))
+ return;
+ if (CMP_FLAG(l, ALTWERASE) && CTL_ALNUM(c) != alnum)
+ return;
+ }
+}
+
+int
+ttydisc_rint(struct tty *tp, char c, int flags)
+{
+ int signal, quote = 0;
+ char ob[3] = { 0xff, 0x00 };
+ size_t ol;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ atomic_add_long(&tty_nin, 1);
+
+ if (ttyhook_hashook(tp, rint))
+ return ttyhook_rint(tp, c, flags);
+
+ if (tp->t_flags & TF_BYPASS)
+ goto processed;
+
+ if (flags) {
+ if (flags & TRE_BREAK) {
+ if (CMP_FLAG(i, IGNBRK)) {
+ /* Ignore break characters. */
+ return (0);
+ } else if (CMP_FLAG(i, BRKINT)) {
+ /* Generate SIGINT on break. */
+ tty_flush(tp, FREAD|FWRITE);
+ tty_signal_pgrp(tp, SIGINT);
+ return (0);
+ } else {
+ /* Just print it. */
+ goto parmrk;
+ }
+ } else if (flags & TRE_FRAMING ||
+ (flags & TRE_PARITY && CMP_FLAG(i, INPCK))) {
+ if (CMP_FLAG(i, IGNPAR)) {
+ /* Ignore bad characters. */
+ return (0);
+ } else {
+ /* Just print it. */
+ goto parmrk;
+ }
+ }
+ }
+
+ /* Allow any character to perform a wakeup. */
+ if (CMP_FLAG(i, IXANY))
+ tp->t_flags &= ~TF_STOPPED;
+
+ /* Remove the top bit. */
+ if (CMP_FLAG(i, ISTRIP))
+ c &= ~0x80;
+
+ /* Skip input processing when we want to print it literally. */
+ if (tp->t_flags & TF_LITERAL) {
+ tp->t_flags &= ~TF_LITERAL;
+ quote = 1;
+ goto processed;
+ }
+
+ /* Special control characters that are implementation dependent. */
+ if (CMP_FLAG(l, IEXTEN)) {
+ /* Accept the next character as literal. */
+ if (CMP_CC(VLNEXT, c)) {
+ if (CMP_FLAG(l, ECHO)) {
+ if (CMP_FLAG(l, ECHOE))
+ ttyoutq_write_nofrag(&tp->t_outq, "^\b", 2);
+ else
+ ttydisc_echo(tp, c, 0);
+ }
+ tp->t_flags |= TF_LITERAL;
+ return (0);
+ }
+ }
+
+ /*
+ * Handle signal processing.
+ */
+ if (CMP_FLAG(l, ISIG)) {
+ if (CMP_FLAG(l, ICANON|IEXTEN) == (ICANON|IEXTEN)) {
+ if (CMP_CC(VSTATUS, c)) {
+ tty_signal_pgrp(tp, SIGINFO);
+ return (0);
+ }
+ }
+
+ /*
+		 * Compared to the old implementation, this
+		 * implementation also flushes the output queue. POSIX
+		 * is really brief about this, but does make us assume
+ * we have to do so.
+ */
+ signal = 0;
+ if (CMP_CC(VINTR, c)) {
+ signal = SIGINT;
+ } else if (CMP_CC(VQUIT, c)) {
+ signal = SIGQUIT;
+ } else if (CMP_CC(VSUSP, c)) {
+ signal = SIGTSTP;
+ }
+
+ if (signal != 0) {
+ /*
+ * Echo the character before signalling the
+ * processes.
+ */
+ if (!CMP_FLAG(l, NOFLSH))
+ tty_flush(tp, FREAD|FWRITE);
+ ttydisc_echo(tp, c, 0);
+ tty_signal_pgrp(tp, signal);
+ return (0);
+ }
+ }
+
+ /*
+ * Handle start/stop characters.
+ */
+ if (CMP_FLAG(i, IXON)) {
+ if (CMP_CC(VSTOP, c)) {
+ /* Stop it if we aren't stopped yet. */
+ if ((tp->t_flags & TF_STOPPED) == 0) {
+ tp->t_flags |= TF_STOPPED;
+ return (0);
+ }
+ /*
+ * Fallthrough:
+ * When VSTART == VSTOP, we should make this key
+ * toggle it.
+ */
+ if (!CMP_CC(VSTART, c))
+ return (0);
+ }
+ if (CMP_CC(VSTART, c)) {
+ tp->t_flags &= ~TF_STOPPED;
+ return (0);
+ }
+ }
+
+ /* Conversion of CR and NL. */
+ switch (c) {
+ case CCR:
+ if (CMP_FLAG(i, IGNCR))
+ return (0);
+ if (CMP_FLAG(i, ICRNL))
+ c = CNL;
+ break;
+ case CNL:
+ if (CMP_FLAG(i, INLCR))
+ c = CCR;
+ break;
+ }
+
+ /* Canonical line editing. */
+ if (CMP_FLAG(l, ICANON)) {
+ if (CMP_CC(VERASE, c) || CMP_CC(VERASE2, c)) {
+ ttydisc_rubchar(tp);
+ return (0);
+ } else if (CMP_CC(VKILL, c)) {
+ while (ttydisc_rubchar(tp) == 0);
+ return (0);
+ } else if (CMP_FLAG(l, IEXTEN)) {
+ if (CMP_CC(VWERASE, c)) {
+ ttydisc_rubword(tp);
+ return (0);
+ } else if (CMP_CC(VREPRINT, c)) {
+ ttydisc_reprint(tp);
+ return (0);
+ }
+ }
+ }
+
+processed:
+ if (CMP_FLAG(i, PARMRK) && (unsigned char)c == 0xff) {
+ /* Print 0xff 0xff. */
+ ob[1] = 0xff;
+ ol = 2;
+ quote = 1;
+ } else {
+ ob[0] = c;
+ ol = 1;
+ }
+
+ goto print;
+
+parmrk:
+ if (CMP_FLAG(i, PARMRK)) {
+ /* Prepend 0xff 0x00 0x.. */
+ ob[2] = c;
+ ol = 3;
+ quote = 1;
+ } else {
+ ob[0] = c;
+ ol = 1;
+ }
+
+print:
+ /* See if we can store this on the input queue. */
+ if (ttyinq_write_nofrag(&tp->t_inq, ob, ol, quote) != 0) {
+ if (CMP_FLAG(i, IMAXBEL))
+ ttyoutq_write_nofrag(&tp->t_outq, "\a", 1);
+
+ /*
+ * Prevent a deadlock here. It may be possible that a
+		 * user has entered so much data that there is no data
+		 * available to read(), even though the buffers are full.
+ *
+ * Only enter the high watermark if the device driver
+ * can actually transmit something.
+ */
+ if (ttyinq_bytescanonicalized(&tp->t_inq) == 0)
+ return (0);
+
+ tty_hiwat_in_block(tp);
+ return (-1);
+ }
+
+ /*
+ * In raw mode, we canonicalize after receiving a single
+ * character. Otherwise, we canonicalize when we receive a
+ * newline, VEOL or VEOF, but only when it isn't quoted.
+ */
+ if (!CMP_FLAG(l, ICANON) ||
+ (!quote && (c == CNL || CMP_CC(VEOL, c) || CMP_CC(VEOF, c)))) {
+ ttyinq_canonicalize(&tp->t_inq);
+ }
+
+ ttydisc_echo(tp, c, quote);
+
+ return (0);
+}
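For readers of the PARMRK paths above, a hedged user-space sketch (not part of this change) of how the 0xff escape sequences produced here are decoded on the read() side; the function name is an illustrative assumption and the caller is assumed to pass len > 0.

#include <stddef.h>

/*
 * Decode one PARMRK-escaped byte: 0xff 0xff is a literal 0xff, and
 * 0xff 0x00 c is byte c received with a break/framing/parity error.
 * Returns the number of input bytes consumed.
 */
static size_t
parmrk_decode(const unsigned char *in, size_t len, unsigned char *out, int *err)
{

	*err = 0;
	if (len >= 2 && in[0] == 0xff && in[1] == 0xff) {
		*out = 0xff;
		return (2);
	}
	if (len >= 3 && in[0] == 0xff && in[1] == 0x00) {
		*out = in[2];
		*err = 1;
		return (3);
	}
	*out = in[0];
	return (1);
}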
+
+size_t
+ttydisc_rint_simple(struct tty *tp, const void *buf, size_t len)
+{
+ const char *cbuf;
+
+ if (ttydisc_can_bypass(tp))
+ return (ttydisc_rint_bypass(tp, buf, len));
+
+ for (cbuf = buf; len-- > 0; cbuf++) {
+ if (ttydisc_rint(tp, *cbuf, 0) != 0)
+ break;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+size_t
+ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len)
+{
+ size_t ret;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ MPASS(tp->t_flags & TF_BYPASS);
+
+ atomic_add_long(&tty_nin, len);
+
+ if (ttyhook_hashook(tp, rint_bypass))
+ return ttyhook_rint_bypass(tp, buf, len);
+
+ ret = ttyinq_write(&tp->t_inq, buf, len, 0);
+ ttyinq_canonicalize(&tp->t_inq);
+ if (ret < len)
+ tty_hiwat_in_block(tp);
+
+ return (ret);
+}
+
+void
+ttydisc_rint_done(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_done))
+ ttyhook_rint_done(tp);
+
+ /* Wake up readers. */
+ tty_wakeup(tp, FREAD);
+ /* Wake up driver for echo. */
+ ttydevsw_outwakeup(tp);
+}
+
+size_t
+ttydisc_rint_poll(struct tty *tp)
+{
+ size_t l;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_poll))
+ return ttyhook_rint_poll(tp);
+
+ /*
+ * XXX: Still allow character input when there's no space in the
+	 * buffers, as long as we haven't entered the high watermark. This is
+ * to allow backspace characters to be inserted when in
+ * canonical mode.
+ */
+ l = ttyinq_bytesleft(&tp->t_inq);
+ if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0)
+ return (1);
+
+ return (l);
+}
+
+static void
+ttydisc_wakeup_watermark(struct tty *tp)
+{
+ size_t c;
+
+ c = ttyoutq_bytesleft(&tp->t_outq);
+ if (tp->t_flags & TF_HIWAT_OUT) {
+ /* Only allow us to run when we're below the watermark. */
+ if (c < tp->t_outlow)
+ return;
+
+ /* Reset the watermark. */
+ tp->t_flags &= ~TF_HIWAT_OUT;
+ } else {
+ /* Only run when we have data at all. */
+ if (c == 0)
+ return;
+ }
+ tty_wakeup(tp, FWRITE);
+}
+
+size_t
+ttydisc_getc(struct tty *tp, void *buf, size_t len)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ if (ttyhook_hashook(tp, getc_inject))
+ return ttyhook_getc_inject(tp, buf, len);
+
+ len = ttyoutq_read(&tp->t_outq, buf, len);
+
+ if (ttyhook_hashook(tp, getc_capture))
+ ttyhook_getc_capture(tp, buf, len);
+
+ ttydisc_wakeup_watermark(tp);
+ atomic_add_long(&tty_nout, len);
+
+ return (len);
+}
+
+int
+ttydisc_getc_uio(struct tty *tp, struct uio *uio)
+{
+ int error = 0;
+ ssize_t obytes = uio->uio_resid;
+ size_t len;
+ char buf[TTY_STACKBUF];
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ /*
+ * When a TTY hook is attached, we cannot perform unbuffered
+ * copying to userspace. Just call ttydisc_getc() and
+ * temporarily store data in a shadow buffer.
+ */
+ if (ttyhook_hashook(tp, getc_capture) ||
+ ttyhook_hashook(tp, getc_inject)) {
+ while (uio->uio_resid > 0) {
+ /* Read to shadow buffer. */
+ len = ttydisc_getc(tp, buf,
+ MIN(uio->uio_resid, sizeof buf));
+ if (len == 0)
+ break;
+
+ /* Copy to userspace. */
+ tty_unlock(tp);
+ error = uiomove(buf, len, uio);
+ tty_lock(tp);
+
+ if (error != 0)
+ break;
+ }
+ } else {
+ error = ttyoutq_read_uio(&tp->t_outq, tp, uio);
+
+ ttydisc_wakeup_watermark(tp);
+ atomic_add_long(&tty_nout, obytes - uio->uio_resid);
+ }
+
+ return (error);
+}
+
+size_t
+ttydisc_getc_poll(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ if (ttyhook_hashook(tp, getc_poll))
+ return ttyhook_getc_poll(tp);
+
+ return ttyoutq_bytesused(&tp->t_outq);
+}
+
+/*
+ * XXX: not really related to the TTYDISC, but we'd better put
+ * tty_putchar() here, because we need to perform proper output
+ * processing.
+ */
+
+int
+tty_putchar(struct tty *tp, char c)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp))
+ return (-1);
+
+ ttydisc_echo_force(tp, c, 0);
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+
+ ttydevsw_outwakeup(tp);
+ return (0);
+}
diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c
new file mode 100644
index 0000000..236b60d
--- /dev/null
+++ b/sys/kern/uipc_accf.c
@@ -0,0 +1,298 @@
+/*-
+ * Copyright (c) 2000 Paycounter, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define ACCEPT_FILTER_MOD
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/protosw.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+static struct mtx accept_filter_mtx;
+MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx",
+ MTX_DEF);
+#define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx)
+#define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx)
+
+static SLIST_HEAD(, accept_filter) accept_filtlsthd =
+ SLIST_HEAD_INITIALIZER(accept_filtlsthd);
+
+MALLOC_DEFINE(M_ACCF, "accf", "accept filter data");
+
+static int unloadable = 0;
+
+SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */
+SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters");
+SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of accept filters (not recommended)");
+
+/*
+ * Must be passed a malloc'd structure so we don't explode if the kld is
+ * unloaded. We leak the struct on deallocation to deal with this, but if a
+ * filter is loaded with the same name as a leaked one, we re-use the entry.
+ */
+int
+accept_filt_add(struct accept_filter *filt)
+{
+ struct accept_filter *p;
+
+ ACCEPT_FILTER_LOCK();
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, filt->accf_name) == 0) {
+ if (p->accf_callback != NULL) {
+ ACCEPT_FILTER_UNLOCK();
+ return (EEXIST);
+ } else {
+ p->accf_callback = filt->accf_callback;
+ ACCEPT_FILTER_UNLOCK();
+ free(filt, M_ACCF);
+ return (0);
+ }
+ }
+
+ if (p == NULL)
+ SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
+ ACCEPT_FILTER_UNLOCK();
+ return (0);
+}
+
+int
+accept_filt_del(char *name)
+{
+ struct accept_filter *p;
+
+ p = accept_filt_get(name);
+ if (p == NULL)
+ return (ENOENT);
+
+ p->accf_callback = NULL;
+ return (0);
+}
+
+struct accept_filter *
+accept_filt_get(char *name)
+{
+ struct accept_filter *p;
+
+ ACCEPT_FILTER_LOCK();
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, name) == 0)
+ break;
+ ACCEPT_FILTER_UNLOCK();
+
+ return (p);
+}
+
+int
+accept_filt_generic_mod_event(module_t mod, int event, void *data)
+{
+ struct accept_filter *p;
+ struct accept_filter *accfp = (struct accept_filter *) data;
+ int error;
+
+ switch (event) {
+ case MOD_LOAD:
+ p = malloc(sizeof(*p), M_ACCF,
+ M_WAITOK);
+ bcopy(accfp, p, sizeof(*p));
+ error = accept_filt_add(p);
+ break;
+
+ case MOD_UNLOAD:
+ /*
+		 * Do not support unloading yet. We don't keep track of
+		 * refcounts, and unloading an accept filter callback and then
+ * having it called is a bad thing. A simple fix would be to
+ * track the refcount in the struct accept_filter.
+ */
+ if (unloadable != 0) {
+ error = accept_filt_del(accfp->accf_name);
+ } else
+ error = EOPNOTSUPP;
+ break;
+
+ case MOD_SHUTDOWN:
+ error = 0;
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+int
+do_getopt_accept_filter(struct socket *so, struct sockopt *sopt)
+{
+ struct accept_filter_arg *afap;
+ int error;
+
+ error = 0;
+ afap = malloc(sizeof(*afap), M_TEMP,
+ M_WAITOK | M_ZERO);
+ SOCK_LOCK(so);
+ if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
+ if (so->so_accf->so_accept_filter_str != NULL)
+ strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+out:
+ SOCK_UNLOCK(so);
+ if (error == 0)
+ error = sooptcopyout(sopt, afap, sizeof(*afap));
+ free(afap, M_TEMP);
+ return (error);
+}
+
+int
+do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
+{
+ struct accept_filter_arg *afap;
+ struct accept_filter *afp;
+ struct so_accf *newaf;
+ int error = 0;
+
+ /*
+ * Handle the simple delete case first.
+ */
+ if (sopt == NULL || sopt->sopt_val == NULL) {
+ SOCK_LOCK(so);
+ if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ SOCK_UNLOCK(so);
+ return (EINVAL);
+ }
+ if (so->so_accf != NULL) {
+ struct so_accf *af = so->so_accf;
+ if (af->so_accept_filter != NULL &&
+ af->so_accept_filter->accf_destroy != NULL) {
+ af->so_accept_filter->accf_destroy(so);
+ }
+ if (af->so_accept_filter_str != NULL)
+ free(af->so_accept_filter_str, M_ACCF);
+ free(af, M_ACCF);
+ so->so_accf = NULL;
+ }
+ so->so_options &= ~SO_ACCEPTFILTER;
+ SOCK_UNLOCK(so);
+ return (0);
+ }
+
+ /*
+ * Pre-allocate any memory we may need later to avoid blocking at
+ * untimely moments. This does not optimize for invalid arguments.
+ */
+ afap = malloc(sizeof(*afap), M_TEMP,
+ M_WAITOK);
+ error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
+ afap->af_name[sizeof(afap->af_name)-1] = '\0';
+ afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
+ if (error) {
+ free(afap, M_TEMP);
+ return (error);
+ }
+ afp = accept_filt_get(afap->af_name);
+ if (afp == NULL) {
+ free(afap, M_TEMP);
+ return (ENOENT);
+ }
+ /*
+ * Allocate the new accept filter instance storage. We may
+ * have to free it again later if we fail to attach it. If
+ * attached properly, 'newaf' is NULLed to avoid a free()
+ * while in use.
+ */
+ newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK |
+ M_ZERO);
+ if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
+ int len = strlen(afap->af_name) + 1;
+ newaf->so_accept_filter_str = malloc(len, M_ACCF,
+ M_WAITOK);
+ strcpy(newaf->so_accept_filter_str, afap->af_name);
+ }
+
+ /*
+ * Require a listen socket; don't try to replace an existing filter
+ * without first removing it.
+ */
+ SOCK_LOCK(so);
+ if (((so->so_options & SO_ACCEPTCONN) == 0) ||
+ (so->so_accf != NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Invoke the accf_create() method of the filter if required. The
+ * socket mutex is held over this call, so create methods for filters
+ * can't block.
+ */
+ if (afp->accf_create != NULL) {
+ newaf->so_accept_filter_arg =
+ afp->accf_create(so, afap->af_arg);
+ if (newaf->so_accept_filter_arg == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+ newaf->so_accept_filter = afp;
+ so->so_accf = newaf;
+ so->so_options |= SO_ACCEPTFILTER;
+ newaf = NULL;
+out:
+ SOCK_UNLOCK(so);
+ if (newaf != NULL) {
+ if (newaf->so_accept_filter_str != NULL)
+ free(newaf->so_accept_filter_str, M_ACCF);
+ free(newaf, M_ACCF);
+ }
+ if (afap != NULL)
+ free(afap, M_TEMP);
+ return (error);
+}
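As a usage note for the setsockopt(2) path handled above, a minimal user-space sketch (not part of this change) that installs an accept filter on a socket that is already listening; "httpready" assumes the accf_http module has been loaded, and the helper name is an illustrative assumption.

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
install_httpready(int s)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "httpready");	/* filter registered by accf_http */
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa)));
}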
diff --git a/sys/kern/uipc_cow.c b/sys/kern/uipc_cow.c
new file mode 100644
index 0000000..8a3a5ff
--- /dev/null
+++ b/sys/kern/uipc_cow.c
@@ -0,0 +1,182 @@
+/*--
+ * Copyright (c) 1997, Duke University
+ * All rights reserved.
+ *
+ * Author:
+ * Andrew Gallatin <gallatin@cs.duke.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of Duke University may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a set of routines for enabling and disabling copy on write
+ * protection for data written into sockets.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/sf_buf.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+
+FEATURE(zero_copy_sockets, "Zero copy sockets support");
+
+struct netsend_cow_stats {
+ int attempted;
+ int fail_not_mapped;
+ int fail_sf_buf;
+ int success;
+ int iodone;
+};
+
+static struct netsend_cow_stats socow_stats;
+
+static int socow_iodone(struct mbuf *m, void *addr, void *args);
+
+static int
+socow_iodone(struct mbuf *m, void *addr, void *args)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+
+ sf = args;
+ pp = sf_buf_page(sf);
+ sf_buf_free(sf);
+ /* remove COW mapping */
+ vm_page_lock(pp);
+ vm_page_cowclear(pp);
+ vm_page_unwire(pp, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock(pp);
+ socow_stats.iodone++;
+ return (EXT_FREE_OK);
+}
+
+int
+socow_setup(struct mbuf *m0, struct uio *uio)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+ struct iovec *iov;
+ struct vmspace *vmspace;
+ struct vm_map *map;
+ vm_offset_t offset, uva;
+ vm_size_t len;
+
+ socow_stats.attempted++;
+ vmspace = curproc->p_vmspace;
+ map = &vmspace->vm_map;
+ uva = (vm_offset_t) uio->uio_iov->iov_base;
+ offset = uva & PAGE_MASK;
+ len = PAGE_SIZE - offset;
+
+ /*
+ * Verify that access to the given address is allowed from user-space.
+ */
+ if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
+ 0) {
+ socow_stats.fail_not_mapped++;
+ return(0);
+ }
+
+ /*
+ * set up COW
+ */
+ vm_page_lock(pp);
+ if (vm_page_cowsetup(pp) != 0) {
+ vm_page_unhold(pp);
+ vm_page_unlock(pp);
+ return (0);
+ }
+
+ /*
+ * wire the page for I/O
+ */
+ vm_page_wire(pp);
+ vm_page_unhold(pp);
+ vm_page_unlock(pp);
+ /*
+ * Allocate an sf buf
+ */
+ sf = sf_buf_alloc(pp, SFB_CATCH);
+ if (sf == NULL) {
+ vm_page_lock(pp);
+ vm_page_cowclear(pp);
+ vm_page_unwire(pp, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock(pp);
+ socow_stats.fail_sf_buf++;
+ return(0);
+ }
+ /*
+ * attach to mbuf
+ */
+ MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
+ (void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
+ m0->m_len = len;
+ m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
+ socow_stats.success++;
+
+ iov = uio->uio_iov;
+ iov->iov_base = (char *)iov->iov_base + m0->m_len;
+ iov->iov_len -= m0->m_len;
+ uio->uio_resid -= m0->m_len;
+ uio->uio_offset += m0->m_len;
+ if (iov->iov_len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+
+ return(m0->m_len);
+}
diff --git a/sys/kern/uipc_debug.c b/sys/kern/uipc_debug.c
new file mode 100644
index 0000000..128c64b
--- /dev/null
+++ b/sys/kern/uipc_debug.c
@@ -0,0 +1,531 @@
+/*-
+ * Copyright (c) 2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Debugger routines relating to sockets, protocols, etc, for use in DDB.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void
+db_print_sotype(short so_type)
+{
+
+ switch (so_type) {
+ case SOCK_STREAM:
+ db_printf("SOCK_STREAM");
+ break;
+
+ case SOCK_DGRAM:
+ db_printf("SOCK_DGRAM");
+ break;
+
+ case SOCK_RAW:
+ db_printf("SOCK_RAW");
+ break;
+
+ case SOCK_RDM:
+ db_printf("SOCK_RDM");
+ break;
+
+ case SOCK_SEQPACKET:
+ db_printf("SOCK_SEQPACKET");
+ break;
+
+ default:
+ db_printf("unknown");
+ break;
+ }
+}
+
+static void
+db_print_sooptions(short so_options)
+{
+ int comma;
+
+ comma = 0;
+ if (so_options & SO_DEBUG) {
+ db_printf("%sSO_DEBUG", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTCONN) {
+ db_printf("%sSO_ACCEPTCONN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEADDR) {
+ db_printf("%sSO_REUSEADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_KEEPALIVE) {
+ db_printf("%sSO_KEEPALIVE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_DONTROUTE) {
+ db_printf("%sSO_DONTROUTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BROADCAST) {
+ db_printf("%sSO_BROADCAST", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_USELOOPBACK) {
+ db_printf("%sSO_USELOOPBACK", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_LINGER) {
+ db_printf("%sSO_LINGER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_OOBINLINE) {
+ db_printf("%sSO_OOBINLINE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEPORT) {
+ db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_TIMESTAMP) {
+ db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NOSIGPIPE) {
+ db_printf("%sSO_NOSIGPIPE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTFILTER) {
+ db_printf("%sSO_ACCEPTFILTER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BINTIME) {
+ db_printf("%sSO_BINTIME", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NO_OFFLOAD) {
+ db_printf("%sSO_NO_OFFLOAD", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NO_DDP) {
+ db_printf("%sSO_NO_DDP", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sostate(short so_state)
+{
+ int comma;
+
+ comma = 0;
+ if (so_state & SS_NOFDREF) {
+ db_printf("%sSS_NOFDREF", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTED) {
+ db_printf("%sSS_ISCONNECTED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTING) {
+ db_printf("%sSS_ISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISDISCONNECTING) {
+ db_printf("%sSS_ISDISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_NBIO) {
+ db_printf("%sSS_NBIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ASYNC) {
+ db_printf("%sSS_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONFIRMING) {
+ db_printf("%sSS_ISCONFIRMING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_PROTOREF) {
+ db_printf("%sSS_PROTOREF", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_soqstate(int so_qstate)
+{
+ int comma;
+
+ comma = 0;
+ if (so_qstate & SQ_INCOMP) {
+ db_printf("%sSQ_INCOMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_qstate & SQ_COMP) {
+ db_printf("%sSQ_COMP", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sbstate(short sb_state)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_state & SBS_CANTSENDMORE) {
+ db_printf("%sSS_CANTSENDMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_CANTRCVMORE) {
+ db_printf("%sSS_CANTRCVMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_RCVATMARK) {
+ db_printf("%sSS_RCVATMARK", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_domain(struct domain *d, const char *domain_name, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", domain_name, d);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("dom_family: %d ", d->dom_family);
+ db_printf("dom_name: %s\n", d->dom_name);
+
+ db_print_indent(indent);
+ db_printf("dom_init: %p ", d->dom_init);
+ db_printf("dom_externalize: %p ", d->dom_externalize);
+ db_printf("dom_dispose: %p\n", d->dom_dispose);
+
+ db_print_indent(indent);
+ db_printf("dom_protosw: %p ", d->dom_protosw);
+ db_printf("dom_next: %p\n", d->dom_next);
+
+ db_print_indent(indent);
+ db_printf("dom_rtattach: %p ", d->dom_rtattach);
+ db_printf("dom_rtoffset: %d ", d->dom_rtoffset);
+ db_printf("dom_maxrtkey: %d\n", d->dom_maxrtkey);
+
+ db_print_indent(indent);
+ db_printf("dom_ifattach: %p ", d->dom_ifattach);
+ db_printf("dom_ifdetach: %p\n", d->dom_ifdetach);
+}
+
+static void
+db_print_prflags(short pr_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (pr_flags & PR_ATOMIC) {
+ db_printf("%sPR_ATOMIC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_ADDR) {
+ db_printf("%sPR_ADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_CONNREQUIRED) {
+ db_printf("%sPR_CONNREQUIRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_WANTRCVD) {
+ db_printf("%sPR_WANTRCVD", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_RIGHTS) {
+ db_printf("%sPR_RIGHTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_IMPLOPCL) {
+ db_printf("%sPR_IMPLOPCL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_LASTHDR) {
+ db_printf("%sPR_LASTHDR", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_protosw(struct protosw *pr, const char *prname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", prname, pr);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("pr_type: %d ", pr->pr_type);
+ db_printf("pr_domain: %p\n", pr->pr_domain);
+ if (pr->pr_domain != NULL)
+ db_print_domain(pr->pr_domain, "pr_domain", indent);
+
+ db_print_indent(indent);
+ db_printf("pr_protocol: %d\n", pr->pr_protocol);
+
+ db_print_indent(indent);
+ db_printf("pr_flags: %d (", pr->pr_flags);
+ db_print_prflags(pr->pr_flags);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("pr_input: %p ", pr->pr_input);
+ db_printf("pr_output: %p ", pr->pr_output);
+ db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput);
+
+ db_print_indent(indent);
+ db_printf("pr_ctloutput: %p ", pr->pr_ctloutput);
+ db_printf("pr_init: %p\n", pr->pr_init);
+
+ db_print_indent(indent);
+ db_printf("pr_fasttimo: %p ", pr->pr_fasttimo);
+ db_printf("pr_slowtimo: %p ", pr->pr_slowtimo);
+ db_printf("pr_drain: %p\n", pr->pr_drain);
+
+ db_print_indent(indent);
+}
+
+static void
+db_print_sbflags(short sb_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_flags & SB_WAIT) {
+ db_printf("%sSB_WAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_SEL) {
+ db_printf("%sSB_SEL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_ASYNC) {
+ db_printf("%sSB_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_UPCALL) {
+ db_printf("%sSB_UPCALL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_NOINTR) {
+ db_printf("%sSB_NOINTR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AIO) {
+ db_printf("%sSB_AIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_KNOTE) {
+ db_printf("%sSB_KNOTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AUTOSIZE) {
+ db_printf("%sSB_AUTOSIZE", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", sockbufname, sb);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("sb_state: 0x%x (", sb->sb_state);
+ db_print_sbstate(sb->sb_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("sb_mb: %p ", sb->sb_mb);
+ db_printf("sb_mbtail: %p ", sb->sb_mbtail);
+ db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord);
+
+ db_print_indent(indent);
+ db_printf("sb_sndptr: %p ", sb->sb_sndptr);
+ db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff);
+
+ db_print_indent(indent);
+ db_printf("sb_cc: %u ", sb->sb_cc);
+ db_printf("sb_hiwat: %u ", sb->sb_hiwat);
+ db_printf("sb_mbcnt: %u ", sb->sb_mbcnt);
+ db_printf("sb_mbmax: %u\n", sb->sb_mbmax);
+
+ db_print_indent(indent);
+ db_printf("sb_ctl: %u ", sb->sb_ctl);
+ db_printf("sb_lowat: %d ", sb->sb_lowat);
+ db_printf("sb_timeo: %jd\n", sb->sb_timeo);
+
+ db_print_indent(indent);
+ db_printf("sb_flags: 0x%x (", sb->sb_flags);
+ db_print_sbflags(sb->sb_flags);
+ db_printf(")\n");
+}
+
+static void
+db_print_socket(struct socket *so, const char *socketname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", socketname, so);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("so_count: %d ", so->so_count);
+ db_printf("so_type: %d (", so->so_type);
+ db_print_sotype(so->so_type);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_options: 0x%x (", so->so_options);
+ db_print_sooptions(so->so_options);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_linger: %d ", so->so_linger);
+ db_printf("so_state: 0x%x (", so->so_state);
+ db_print_sostate(so->so_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_qstate: 0x%x (", so->so_qstate);
+ db_print_soqstate(so->so_qstate);
+ db_printf(") ");
+ db_printf("so_pcb: %p ", so->so_pcb);
+ db_printf("so_proto: %p\n", so->so_proto);
+
+ if (so->so_proto != NULL)
+ db_print_protosw(so->so_proto, "so_proto", indent);
+
+ db_print_indent(indent);
+ db_printf("so_head: %p ", so->so_head);
+ db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp));
+ db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+
+ db_print_indent(indent);
+ /* so_list skipped */
+ db_printf("so_qlen: %d ", so->so_qlen);
+ db_printf("so_incqlen: %d ", so->so_incqlen);
+ db_printf("so_qlimit: %d ", so->so_qlimit);
+ db_printf("so_timeo: %d ", so->so_timeo);
+ db_printf("so_error: %d\n", so->so_error);
+
+ db_print_indent(indent);
+ db_printf("so_sigio: %p ", so->so_sigio);
+ db_printf("so_oobmark: %lu ", so->so_oobmark);
+ db_printf("so_aiojobq first: %p\n", TAILQ_FIRST(&so->so_aiojobq));
+
+ db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+ db_print_sockbuf(&so->so_snd, "so_snd", indent);
+}
+
+DB_SHOW_COMMAND(socket, db_show_socket)
+{
+ struct socket *so;
+
+ if (!have_addr) {
+ db_printf("usage: show socket <addr>\n");
+ return;
+ }
+ so = (struct socket *)addr;
+
+ db_print_socket(so, "socket", 0);
+}
+
+DB_SHOW_COMMAND(sockbuf, db_show_sockbuf)
+{
+ struct sockbuf *sb;
+
+ if (!have_addr) {
+ db_printf("usage: show sockbuf <addr>\n");
+ return;
+ }
+ sb = (struct sockbuf *)addr;
+
+ db_print_sockbuf(sb, "sockbuf", 0);
+}
+
+DB_SHOW_COMMAND(protosw, db_show_protosw)
+{
+ struct protosw *pr;
+
+ if (!have_addr) {
+ db_printf("usage: show protosw <addr>\n");
+ return;
+ }
+ pr = (struct protosw *)addr;
+
+ db_print_protosw(pr, "protosw", 0);
+}
+
+DB_SHOW_COMMAND(domain, db_show_domain)
+{
+ struct domain *d;
+
+ if (!have_addr) {
+		db_printf("usage: show domain <addr>\n");
+ return;
+ }
+ d = (struct domain *)addr;
+
+ db_print_domain(d, "domain", 0);
+}
+#endif
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 0000000..709cc0e
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/socketvar.h>
+#include <sys/systm.h>
+
+#include <net/vnet.h>
+
+/*
+ * System initialization
+ *
+ * Note: domain initialization takes place on a per domain basis
+ * as a result of traversing a SYSINIT linker set. Most likely,
+ * each domain would want to call DOMAIN_SET(9) itself, which
+ * would cause the domain to be added just after domaininit()
+ * is called during startup.
+ *
+ * See DOMAIN_SET(9) for details on its use.
+ */
+
+static void domaininit(void *);
+SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL);
+
+static void domainfinalize(void *);
+SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize,
+ NULL);
+
+static struct callout pffast_callout;
+static struct callout pfslow_callout;
+
+static void pffasttimo(void *);
+static void pfslowtimo(void *);
+
+struct domain *domains; /* registered protocol domains */
+int domain_init_status = 0;
+static struct mtx dom_mtx; /* domain list lock */
+MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF);
+
+/*
+ * Dummy protocol specific user requests function pointer array.
+ * All functions return EOPNOTSUPP.
+ */
+struct pr_usrreqs nousrreqs = {
+ .pru_accept = pru_accept_notsupp,
+ .pru_attach = pru_attach_notsupp,
+ .pru_bind = pru_bind_notsupp,
+ .pru_connect = pru_connect_notsupp,
+ .pru_connect2 = pru_connect2_notsupp,
+ .pru_control = pru_control_notsupp,
+ .pru_disconnect = pru_disconnect_notsupp,
+ .pru_listen = pru_listen_notsupp,
+ .pru_peeraddr = pru_peeraddr_notsupp,
+ .pru_rcvd = pru_rcvd_notsupp,
+ .pru_rcvoob = pru_rcvoob_notsupp,
+ .pru_send = pru_send_notsupp,
+ .pru_sense = pru_sense_null,
+ .pru_shutdown = pru_shutdown_notsupp,
+ .pru_sockaddr = pru_sockaddr_notsupp,
+ .pru_sosend = pru_sosend_notsupp,
+ .pru_soreceive = pru_soreceive_notsupp,
+ .pru_sopoll = pru_sopoll_notsupp,
+};
+
+static void
+protosw_init(struct protosw *pr)
+{
+ struct pr_usrreqs *pu;
+
+ pu = pr->pr_usrreqs;
+ KASSERT(pu != NULL, ("protosw_init: %ssw[%d] has no usrreqs!",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+
+ /*
+ * Protocol switch methods fall into three categories: mandatory,
+ * mandatory but protosw_init() provides a default, and optional.
+ *
+ * For true protocols (i.e., pru_attach != NULL), KASSERT truly
+ * mandatory methods with no defaults, and initialize defaults for
+ * other mandatory methods if the protocol hasn't defined an
+ * implementation (NULL function pointer).
+ */
+#if 0
+ if (pu->pru_attach != NULL) {
+ KASSERT(pu->pru_abort != NULL,
+ ("protosw_init: %ssw[%d] pru_abort NULL",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+ KASSERT(pu->pru_send != NULL,
+ ("protosw_init: %ssw[%d] pru_send NULL",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+ }
+#endif
+
+#define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
+ DEFAULT(pu->pru_accept, pru_accept_notsupp);
+ DEFAULT(pu->pru_bind, pru_bind_notsupp);
+ DEFAULT(pu->pru_bindat, pru_bindat_notsupp);
+ DEFAULT(pu->pru_connect, pru_connect_notsupp);
+ DEFAULT(pu->pru_connect2, pru_connect2_notsupp);
+ DEFAULT(pu->pru_connectat, pru_connectat_notsupp);
+ DEFAULT(pu->pru_control, pru_control_notsupp);
+ DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp);
+ DEFAULT(pu->pru_listen, pru_listen_notsupp);
+ DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp);
+ DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
+ DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
+ DEFAULT(pu->pru_sense, pru_sense_null);
+ DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp);
+ DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp);
+ DEFAULT(pu->pru_sosend, sosend_generic);
+ DEFAULT(pu->pru_soreceive, soreceive_generic);
+ DEFAULT(pu->pru_sopoll, sopoll_generic);
+#undef DEFAULT
+ if (pr->pr_init)
+ (*pr->pr_init)();
+}
+
+/*
+ * Add a new protocol domain to the list of supported domains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+void
+domain_init(void *arg)
+{
+ struct domain *dp = arg;
+ struct protosw *pr;
+
+ if (dp->dom_init)
+ (*dp->dom_init)();
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ protosw_init(pr);
+ /*
+ * update global information about maximums
+ */
+ max_hdr = max_linkhdr + max_protohdr;
+ max_datalen = MHLEN - max_hdr;
+ if (max_datalen < 1)
+ panic("%s: max_datalen < 1", __func__);
+}
+
+#ifdef VIMAGE
+void
+vnet_domain_init(void *arg)
+{
+
+ /* Virtualized case is no different -- call init functions. */
+ domain_init(arg);
+}
+
+void
+vnet_domain_uninit(void *arg)
+{
+ struct domain *dp = arg;
+ struct protosw *pr;
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_destroy)
+ (*pr->pr_destroy)();
+ if (dp->dom_destroy)
+ (*dp->dom_destroy)();
+}
+#endif
+
+/*
+ * Add a new protocol domain to the list of supported domains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+void
+domain_add(void *data)
+{
+ struct domain *dp;
+
+ dp = (struct domain *)data;
+ mtx_lock(&dom_mtx);
+ dp->dom_next = domains;
+ domains = dp;
+
+ KASSERT(domain_init_status >= 1,
+ ("attempt to domain_add(%s) before domaininit()",
+ dp->dom_name));
+#ifndef INVARIANTS
+ if (domain_init_status < 1)
+ printf("WARNING: attempt to domain_add(%s) before "
+ "domaininit()\n", dp->dom_name);
+#endif
+#ifdef notyet
+ KASSERT(domain_init_status < 2,
+ ("attempt to domain_add(%s) after domainfinalize()",
+ dp->dom_name));
+#else
+ if (domain_init_status >= 2)
+ printf("WARNING: attempt to domain_add(%s) after "
+ "domainfinalize()\n", dp->dom_name);
+#endif
+ mtx_unlock(&dom_mtx);
+}
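domain_add() is normally reached through the DOMAIN_SET(9) linker-set hook mentioned in the comment near the top of this file; below is a hedged sketch (not part of this change) of what a protocol family provides. "foo", AF_FOO, and the single protosw entry are hypothetical placeholders, and a real family supplies its own pr_usrreqs instead of nousrreqs.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>

static struct domain foodomain;			/* defined below */

static struct protosw foosw[] = {
	{
		.pr_type =	SOCK_DGRAM,
		.pr_domain =	&foodomain,
		.pr_protocol =	1,		/* hypothetical */
		.pr_flags =	PR_ATOMIC|PR_ADDR,
		.pr_usrreqs =	&nousrreqs,	/* placeholder only */
	},
};

static struct domain foodomain = {
	.dom_family =	AF_FOO,			/* hypothetical family number */
	.dom_name =	"foo",
	.dom_protosw =	foosw,
	.dom_protoswNPROTOSW = &foosw[sizeof(foosw) / sizeof(foosw[0])],
};

DOMAIN_SET(foo);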
+
+/* ARGSUSED*/
+static void
+domaininit(void *dummy)
+{
+
+ if (max_linkhdr < 16) /* XXX */
+ max_linkhdr = 16;
+
+ callout_init(&pffast_callout, CALLOUT_MPSAFE);
+ callout_init(&pfslow_callout, CALLOUT_MPSAFE);
+
+ mtx_lock(&dom_mtx);
+ KASSERT(domain_init_status == 0, ("domaininit called too late!"));
+ domain_init_status = 1;
+ mtx_unlock(&dom_mtx);
+}
+
+/* ARGSUSED*/
+static void
+domainfinalize(void *dummy)
+{
+
+ mtx_lock(&dom_mtx);
+ KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
+ domain_init_status = 2;
+ mtx_unlock(&dom_mtx);
+
+ callout_reset(&pffast_callout, 1, pffasttimo, NULL);
+ callout_reset(&pfslow_callout, 1, pfslowtimo, NULL);
+}
+
+struct domain *
+pffinddomain(int family)
+{
+ struct domain *dp;
+
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ return (dp);
+ return (NULL);
+}
+
+struct protosw *
+pffindtype(int family, int type)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (NULL);
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_type && pr->pr_type == type)
+ return (pr);
+ return (NULL);
+}
+
+struct protosw *
+pffindproto(int family, int protocol, int type)
+{
+ struct domain *dp;
+ struct protosw *pr;
+ struct protosw *maybe;
+
+ maybe = NULL;
+ if (family == 0)
+ return (NULL);
+
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (NULL);
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
+ return (pr);
+
+ if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
+ pr->pr_protocol == 0 && maybe == NULL)
+ maybe = pr;
+ }
+ return (maybe);
+}
+
+/*
+ * The caller must make sure that the new protocol is fully set up and ready to
+ * accept requests before it is registered.
+ */
+int
+pf_proto_register(int family, struct protosw *npr)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct domain *dp;
+ struct protosw *pr, *fpr;
+
+ /* Sanity checks. */
+ if (family == 0)
+ return (EPFNOSUPPORT);
+ if (npr->pr_type == 0)
+ return (EPROTOTYPE);
+ if (npr->pr_protocol == 0)
+ return (EPROTONOSUPPORT);
+ if (npr->pr_usrreqs == NULL)
+ return (ENXIO);
+
+ /* Try to find the specified domain based on the family. */
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (EPFNOSUPPORT);
+
+ /* Initialize backpointer to struct domain. */
+ npr->pr_domain = dp;
+ fpr = NULL;
+
+ /*
+ * Protect us against races when two protocol registrations for
+ * the same protocol happen at the same time.
+ */
+ mtx_lock(&dom_mtx);
+
+ /* The new protocol must not yet exist. */
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_type == npr->pr_type) &&
+ (pr->pr_protocol == npr->pr_protocol)) {
+ mtx_unlock(&dom_mtx);
+ return (EEXIST); /* XXX: Check only protocol? */
+ }
+ /* While here, remember the first free spacer. */
+ if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER))
+ fpr = pr;
+ }
+
+ /* If no free spacer is found we can't add the new protocol. */
+ if (fpr == NULL) {
+ mtx_unlock(&dom_mtx);
+ return (ENOMEM);
+ }
+
+ /* Copy the new struct protosw over the spacer. */
+ bcopy(npr, fpr, sizeof(*fpr));
+
+ /* Job is done, no more protection required. */
+ mtx_unlock(&dom_mtx);
+
+ /* Initialize and activate the protocol. */
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET_QUIET(vnet_iter);
+ protosw_init(fpr);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ return (0);
+}
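A hedged sketch (not part of this change) of the intended call pattern: a kernel module registers its protosw into a spacer slot of an existing family at MOD_LOAD and removes it again at MOD_UNLOAD. The names, the protocol number 253, and the externally provided bar_usrreqs are hypothetical placeholders; the module would then be hooked up with DECLARE_MODULE(9).

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/protosw.h>

extern struct pr_usrreqs bar_usrreqs;	/* hypothetical, provided by the protocol */

static struct protosw bar_protosw = {
	.pr_type =	SOCK_DGRAM,
	.pr_protocol =	253,		/* hypothetical, otherwise unused */
	.pr_flags =	PR_ATOMIC|PR_ADDR,
	.pr_usrreqs =	&bar_usrreqs,
};

static int
bar_modevent(module_t mod, int event, void *arg)
{

	switch (event) {
	case MOD_LOAD:
		return (pf_proto_register(PF_INET, &bar_protosw));
	case MOD_UNLOAD:
		return (pf_proto_unregister(PF_INET, 253, SOCK_DGRAM));
	default:
		return (EOPNOTSUPP);
	}
}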
+
+/*
+ * The caller must make sure the protocol and its functions correctly shut down
+ * all sockets and release all locks and memory references.
+ */
+int
+pf_proto_unregister(int family, int protocol, int type)
+{
+ struct domain *dp;
+ struct protosw *pr, *dpr;
+
+ /* Sanity checks. */
+ if (family == 0)
+ return (EPFNOSUPPORT);
+ if (protocol == 0)
+ return (EPROTONOSUPPORT);
+ if (type == 0)
+ return (EPROTOTYPE);
+
+ /* Try to find the specified domain based on the family type. */
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (EPFNOSUPPORT);
+
+ dpr = NULL;
+
+ /* Lock out everyone else while we are manipulating the protosw. */
+ mtx_lock(&dom_mtx);
+
+	/* The protocol must exist, and exactly once. */
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) {
+ if (dpr != NULL) {
+ mtx_unlock(&dom_mtx);
+ return (EMLINK); /* Should not happen! */
+ } else
+ dpr = pr;
+ }
+ }
+
+ /* Protocol does not exist. */
+ if (dpr == NULL) {
+ mtx_unlock(&dom_mtx);
+ return (EPROTONOSUPPORT);
+ }
+
+ /* De-orbit the protocol and make the slot available again. */
+ dpr->pr_type = 0;
+ dpr->pr_domain = dp;
+ dpr->pr_protocol = PROTO_SPACER;
+ dpr->pr_flags = 0;
+ dpr->pr_input = NULL;
+ dpr->pr_output = NULL;
+ dpr->pr_ctlinput = NULL;
+ dpr->pr_ctloutput = NULL;
+ dpr->pr_init = NULL;
+ dpr->pr_fasttimo = NULL;
+ dpr->pr_slowtimo = NULL;
+ dpr->pr_drain = NULL;
+ dpr->pr_usrreqs = &nousrreqs;
+
+ /* Job is done, no more protection required. */
+ mtx_unlock(&dom_mtx);
+
+ return (0);
+}
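+
+/*
+ * Illustrative usage sketch (not part of this change): a loadable module
+ * implementing a hypothetical "foo" datagram protocol on top of PF_INET,
+ * supplying its own foo_input() and foo_usrreqs and an unused protocol
+ * number FOO_PROTO, might register and later unregister itself like this:
+ *
+ * static struct protosw foo_protosw = {
+ * .pr_type = SOCK_DGRAM,
+ * .pr_protocol = FOO_PROTO,
+ * .pr_flags = PR_ATOMIC | PR_ADDR,
+ * .pr_input = foo_input,
+ * .pr_usrreqs = &foo_usrreqs,
+ * };
+ *
+ * error = pf_proto_register(PF_INET, &foo_protosw);
+ * ...
+ * error = pf_proto_unregister(PF_INET, FOO_PROTO, SOCK_DGRAM);
+ *
+ * pr_domain need not be set; pf_proto_register() fills it in.
+ */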
+
+void
+pfctlinput(int cmd, struct sockaddr *sa)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, (void *)0);
+}
+
+void
+pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ if (!sa)
+ return;
+ for (dp = domains; dp; dp = dp->dom_next) {
+ /*
+ * The check must be made by xx_ctlinput() anyway, to
+ * make sure the data item pointed to by ctlparam is used
+ * correctly. The following check is made just for safety.
+ */
+ if (dp->dom_family != sa->sa_family)
+ continue;
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, ctlparam);
+ }
+}
+
+static void
+pfslowtimo(void *arg)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_slowtimo)
+ (*pr->pr_slowtimo)();
+ callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL);
+}
+
+static void
+pffasttimo(void *arg)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_fasttimo)
+ (*pr->pr_fasttimo)();
+ callout_reset(&pffast_callout, hz/5, pffasttimo, NULL);
+}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
new file mode 100644
index 0000000..8e278a4
--- /dev/null
+++ b/sys/kern/uipc_mbuf.c
@@ -0,0 +1,2182 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+#include "opt_mbuf_stress_test.h"
+#include "opt_mbuf_profiling.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/uio.h>
+
+int max_linkhdr;
+int max_protohdr;
+int max_hdr;
+int max_datalen;
+#ifdef MBUF_STRESS_TEST
+int m_defragpackets;
+int m_defragbytes;
+int m_defraguseless;
+int m_defragfailure;
+int m_defragrandomfailures;
+#endif
+
+/*
+ * sysctl(8) exported objects
+ */
+SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
+ &max_linkhdr, 0, "Size of largest link layer header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
+ &max_protohdr, 0, "Size of largest protocol layer header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
+ &max_hdr, 0, "Size of largest link plus protocol header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
+ &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
+#ifdef MBUF_STRESS_TEST
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
+ &m_defragpackets, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
+ &m_defragbytes, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
+ &m_defraguseless, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
+ &m_defragfailure, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
+ &m_defragrandomfailures, 0, "");
+#endif
+
+/*
+ * Ensure the correct size of various mbuf parameters. It could be off due
+ * to compiler-induced padding and alignment artifacts.
+ */
+CTASSERT(sizeof(struct mbuf) == MSIZE);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
+
+/*
+ * m_get2() allocates the minimum mbuf that will fit the "size" argument.
+ */
+struct mbuf *
+m_get2(int size, int how, short type, int flags)
+{
+ struct mb_args args;
+ struct mbuf *m, *n;
+
+ args.flags = flags;
+ args.type = type;
+
+ if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+ if (size <= MCLBYTES)
+ return (uma_zalloc_arg(zone_pack, &args, how));
+
+ if (size > MJUMPAGESIZE)
+ return (NULL);
+
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m == NULL)
+ return (NULL);
+
+ n = uma_zalloc_arg(zone_jumbop, m, how);
+ if (n == NULL) {
+ uma_zfree(zone_mbuf, m);
+ return (NULL);
+ }
+
+ return (m);
+}
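+
+/*
+ * Usage sketch (illustrative): a caller needing room for a full Ethernet
+ * frame can let m_get2() pick the backing storage:
+ *
+ * m = m_get2(ETHER_MAX_LEN, M_NOWAIT, MT_DATA, M_PKTHDR);
+ * if (m == NULL)
+ * return (ENOBUFS);
+ *
+ * Small sizes come from the plain mbuf zone, sizes up to MCLBYTES from
+ * the packet zone and up to MJUMPAGESIZE from the page-sized jumbo
+ * zone; anything larger fails with NULL.
+ */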
+
+/*
+ * m_getjcl() returns an mbuf with a cluster of the specified size attached.
+ * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
+ */
+struct mbuf *
+m_getjcl(int how, short type, int flags, int size)
+{
+ struct mb_args args;
+ struct mbuf *m, *n;
+ uma_zone_t zone;
+
+ if (size == MCLBYTES)
+ return m_getcl(how, type, flags);
+
+ args.flags = flags;
+ args.type = type;
+
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m == NULL)
+ return (NULL);
+
+ zone = m_getzone(size);
+ n = uma_zalloc_arg(zone, m, how);
+ if (n == NULL) {
+ uma_zfree(zone_mbuf, m);
+ return (NULL);
+ }
+ return (m);
+}
+
+/*
+ * Allocate a given length worth of mbufs and/or clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If an
+ * existing mbuf chain is provided, then we will append the new chain
+ * to the existing one but still return the top of the newly allocated
+ * chain.
+ */
+struct mbuf *
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
+{
+ struct mbuf *mb, *nm = NULL, *mtail = NULL;
+
+ KASSERT(len >= 0, ("%s: len is < 0", __func__));
+
+ /* Validate flags. */
+ flags &= (M_PKTHDR | M_EOR);
+
+ /* Packet header mbuf must be first in chain. */
+ if ((flags & M_PKTHDR) && m != NULL)
+ flags &= ~M_PKTHDR;
+
+ /* Loop and append maximum sized mbufs to the chain tail. */
+ while (len > 0) {
+ if (len > MCLBYTES)
+ mb = m_getjcl(how, type, (flags & M_PKTHDR),
+ MJUMPAGESIZE);
+ else if (len >= MINCLSIZE)
+ mb = m_getcl(how, type, (flags & M_PKTHDR));
+ else if (flags & M_PKTHDR)
+ mb = m_gethdr(how, type);
+ else
+ mb = m_get(how, type);
+
+ /* Fail the whole operation if one mbuf can't be allocated. */
+ if (mb == NULL) {
+ if (nm != NULL)
+ m_freem(nm);
+ return (NULL);
+ }
+
+ /* Book keeping. */
+ len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+ ((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+ if (mtail != NULL)
+ mtail->m_next = mb;
+ else
+ nm = mb;
+ mtail = mb;
+ flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */
+ }
+ if (flags & M_EOR)
+ mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */
+
+ /* If mbuf was supplied, append new chain to the end of it. */
+ if (m != NULL) {
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+ ;
+ mtail->m_next = nm;
+ mtail->m_flags &= ~M_EOR;
+ } else
+ m = nm;
+
+ return (m);
+}
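+
+/*
+ * Usage sketch (illustrative): allocate a chain able to hold "total"
+ * bytes and fill each mbuf up to its trailing space, which is the
+ * pattern m_uiotombuf() later in this file uses:
+ *
+ * m = m_getm2(NULL, total, M_WAITOK, MT_DATA, M_PKTHDR);
+ * for (mb = m; mb != NULL; mb = mb->m_next) {
+ * mb->m_len = min(M_TRAILINGSPACE(mb), total - progress);
+ * ... fill mtod(mb, void *) with mb->m_len bytes ...
+ * progress += mb->m_len;
+ * m->m_pkthdr.len += mb->m_len;
+ * }
+ *
+ * With M_WAITOK the allocation does not fail; with M_NOWAIT the return
+ * value must be checked for NULL.
+ */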
+
+/*
+ * Free an entire chain of mbufs and associated external buffers, if
+ * applicable.
+ */
+void
+m_freem(struct mbuf *mb)
+{
+
+ while (mb != NULL)
+ mb = m_free(mb);
+}
+
+/*-
+ * Configure a provided mbuf to refer to the provided external storage
+ * buffer and set up a reference count for said buffer. If the setting
+ * up of the reference count fails, the M_EXT bit will not be set. If
+ * successful, the M_EXT bit is set in the mbuf's flags.
+ *
+ * Arguments:
+ * mb The existing mbuf to which to attach the provided buffer.
+ * buf The address of the provided external storage buffer.
+ * size The size of the provided buffer.
+ * freef A pointer to a routine that is responsible for freeing the
+ * provided external storage buffer.
+ * arg1, arg2 Argument pointers (of any type) to be passed to the
+ * provided freef routine (may be NULL).
+ * flags Any other flags to be passed to the provided mbuf.
+ * type The type that the external storage buffer should be
+ * labeled with.
+ * wait M_WAITOK or M_NOWAIT for allocating the reference count.
+ *
+ * Returns:
+ * 0 on success, or ENOMEM if the reference count could not be
+ * allocated.
+ */
+int
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ int (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2,
+ int flags, int type, int wait)
+{
+ KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
+
+ if (type != EXT_EXTREF)
+ mb->m_ext.ref_cnt = uma_zalloc(zone_ext_refcnt, wait);
+
+ if (mb->m_ext.ref_cnt == NULL)
+ return (ENOMEM);
+
+ *(mb->m_ext.ref_cnt) = 1;
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_arg1 = arg1;
+ mb->m_ext.ext_arg2 = arg2;
+ mb->m_ext.ext_type = type;
+ mb->m_ext.ext_flags = 0;
+
+ return (0);
+}
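+
+/*
+ * Usage sketch (illustrative, hypothetical driver buffer): attach a
+ * driver-owned buffer "buf" of FOO_BUFSZ bytes to a fresh mbuf, with
+ * foo_ext_free() (matching the freef signature above) invoked when the
+ * last reference is dropped:
+ *
+ * m = m_gethdr(M_NOWAIT, MT_DATA);
+ * if (m == NULL)
+ * return (ENOBUFS);
+ * if (m_extadd(m, (caddr_t)buf, FOO_BUFSZ, foo_ext_free, buf, NULL,
+ * 0, EXT_NET_DRV, M_NOWAIT) != 0) {
+ * m_free(m);
+ * return (ENOBUFS);
+ * }
+ */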
+
+/*
+ * Non-directly-exported function to clean up after mbufs with M_EXT
+ * storage attached to them if the reference count hits 1.
+ */
+void
+mb_free_ext(struct mbuf *m)
+{
+ int skipmbuf;
+
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+
+ /*
+ * check if the header is embedded in the cluster
+ */
+ skipmbuf = (m->m_flags & M_NOFREE);
+
+ /* Free attached storage if this mbuf is the only reference to it. */
+ if (*(m->m_ext.ref_cnt) == 1 ||
+ atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) {
+ switch (m->m_ext.ext_type) {
+ case EXT_PACKET: /* The packet zone is special. */
+ if (*(m->m_ext.ref_cnt) == 0)
+ *(m->m_ext.ref_cnt) = 1;
+ uma_zfree(zone_pack, m);
+ return; /* Job done. */
+ case EXT_CLUSTER:
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBOP:
+ uma_zfree(zone_jumbop, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBO9:
+ uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBO16:
+ uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
+ break;
+ case EXT_SFBUF:
+ case EXT_NET_DRV:
+ case EXT_MOD_TYPE:
+ case EXT_DISPOSABLE:
+ *(m->m_ext.ref_cnt) = 0;
+ uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
+ m->m_ext.ref_cnt));
+ /* FALLTHROUGH */
+ case EXT_EXTREF:
+ KASSERT(m->m_ext.ext_free != NULL,
+ ("%s: ext_free not set", __func__));
+ (void)(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
+ m->m_ext.ext_arg2);
+ break;
+ default:
+ KASSERT(m->m_ext.ext_type == 0,
+ ("%s: unknown ext_type", __func__));
+ }
+ }
+ if (skipmbuf)
+ return;
+
+ /*
+ * Free this mbuf back to the mbuf zone with all m_ext
+ * information purged.
+ */
+ m->m_ext.ext_buf = NULL;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg1 = NULL;
+ m->m_ext.ext_arg2 = NULL;
+ m->m_ext.ref_cnt = NULL;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_type = 0;
+ m->m_ext.ext_flags = 0;
+ m->m_flags &= ~M_EXT;
+ uma_zfree(zone_mbuf, m);
+}
+
+/*
+ * Attach the cluster from *m to *n, set up m_ext in *n
+ * and bump the refcount of the cluster.
+ */
+static void
+mb_dupcl(struct mbuf *n, struct mbuf *m)
+{
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+ KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+
+ if (*(m->m_ext.ref_cnt) == 1)
+ *(m->m_ext.ref_cnt) += 1;
+ else
+ atomic_add_int(m->m_ext.ref_cnt, 1);
+ n->m_ext.ext_buf = m->m_ext.ext_buf;
+ n->m_ext.ext_free = m->m_ext.ext_free;
+ n->m_ext.ext_arg1 = m->m_ext.ext_arg1;
+ n->m_ext.ext_arg2 = m->m_ext.ext_arg2;
+ n->m_ext.ext_size = m->m_ext.ext_size;
+ n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ n->m_ext.ext_type = m->m_ext.ext_type;
+ n->m_ext.ext_flags = m->m_ext.ext_flags;
+ n->m_flags |= M_EXT;
+ n->m_flags |= m->m_flags & M_RDONLY;
+}
+
+/*
+ * Clean up mbuf (chain) from any tags and packet headers.
+ * If "all" is set then the first mbuf in the chain will be
+ * cleaned too.
+ */
+void
+m_demote(struct mbuf *m0, int all)
+{
+ struct mbuf *m;
+
+ for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
+ if (m->m_flags & M_PKTHDR) {
+ m_tag_delete_chain(m, NULL);
+ m->m_flags &= ~M_PKTHDR;
+ bzero(&m->m_pkthdr, sizeof(struct pkthdr));
+ }
+ if (m != m0 && m->m_nextpkt != NULL) {
+ KASSERT(m->m_nextpkt == NULL,
+ ("%s: m_nextpkt not NULL", __func__));
+ m_freem(m->m_nextpkt);
+ m->m_nextpkt = NULL;
+ }
+ m->m_flags = m->m_flags & (M_EXT|M_RDONLY|M_NOFREE);
+ }
+}
+
+/*
+ * Sanity checks on mbuf (chain) for use in KASSERT() and general
+ * debugging.
+ * Returns 0 (or panics) when the chain is bad and 1 when all tests pass.
+ * The sanitize argument selects the failure action: 0 runs
+ * M_SANITY_ACTION, 1 scrubs the offending fields (garbling them so that
+ * incorrect use blows up later).
+ */
+int
+m_sanity(struct mbuf *m0, int sanitize)
+{
+ struct mbuf *m;
+ caddr_t a, b;
+ int pktlen = 0;
+
+#ifdef INVARIANTS
+#define M_SANITY_ACTION(s) panic("mbuf %p: " s, m)
+#else
+#define M_SANITY_ACTION(s) printf("mbuf %p: " s, m)
+#endif
+
+ for (m = m0; m != NULL; m = m->m_next) {
+ /*
+ * Basic pointer checks. If any of these fails then some
+ * unrelated kernel memory before or after us is trashed.
+ * No way to recover from that.
+ */
+ a = ((m->m_flags & M_EXT) ? m->m_ext.ext_buf :
+ ((m->m_flags & M_PKTHDR) ? (caddr_t)(&m->m_pktdat) :
+ (caddr_t)(&m->m_dat)) );
+ b = (caddr_t)(a + (m->m_flags & M_EXT ? m->m_ext.ext_size :
+ ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN)));
+ if ((caddr_t)m->m_data < a)
+ M_SANITY_ACTION("m_data outside mbuf data range left");
+ if ((caddr_t)m->m_data > b)
+ M_SANITY_ACTION("m_data outside mbuf data range right");
+ if ((caddr_t)m->m_data + m->m_len > b)
+ M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
+
+ /* m->m_nextpkt may only be set on first mbuf in chain. */
+ if (m != m0 && m->m_nextpkt != NULL) {
+ if (sanitize) {
+ m_freem(m->m_nextpkt);
+ m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
+ } else
+ M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
+ }
+
+ /* packet length (not mbuf length!) calculation */
+ if (m0->m_flags & M_PKTHDR)
+ pktlen += m->m_len;
+
+ /* m_tags may only be attached to first mbuf in chain. */
+ if (m != m0 && m->m_flags & M_PKTHDR &&
+ !SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ if (sanitize) {
+ m_tag_delete_chain(m, NULL);
+ /* put in 0xDEADC0DE perhaps? */
+ } else
+ M_SANITY_ACTION("m_tags on in-chain mbuf");
+ }
+
+ /* M_PKTHDR may only be set on first mbuf in chain */
+ if (m != m0 && m->m_flags & M_PKTHDR) {
+ if (sanitize) {
+ bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
+ m->m_flags &= ~M_PKTHDR;
+ /* put in 0xDEADCODE and leave hdr flag in */
+ } else
+ M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
+ }
+ }
+ m = m0;
+ if (pktlen && pktlen != m->m_pkthdr.len) {
+ if (sanitize)
+ m->m_pkthdr.len = 0;
+ else
+ M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
+ }
+ return 1;
+
+#undef M_SANITY_ACTION
+}
+
+
+/*
+ * "Move" mbuf pkthdr from "from" to "to".
+ * "from" must have M_PKTHDR set, and "to" must be empty.
+ */
+void
+m_move_pkthdr(struct mbuf *to, struct mbuf *from)
+{
+
+#if 0
+ /* see below for why these are not enabled */
+ M_ASSERTPKTHDR(to);
+ /* Note: with MAC, this may not be a good assertion. */
+ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
+ ("m_move_pkthdr: to has tags"));
+#endif
+#ifdef MAC
+ /*
+ * XXXMAC: It could be this should also occur for non-MAC?
+ */
+ if (to->m_flags & M_PKTHDR)
+ m_tag_delete_chain(to, NULL);
+#endif
+ to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ if ((to->m_flags & M_EXT) == 0)
+ to->m_data = to->m_pktdat;
+ to->m_pkthdr = from->m_pkthdr; /* especially tags */
+ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */
+ from->m_flags &= ~M_PKTHDR;
+}
+
+/*
+ * Duplicate "from"'s mbuf pkthdr in "to".
+ * "from" must have M_PKTHDR set, and "to" must be empty.
+ * In particular, this does a deep copy of the packet tags.
+ */
+int
+m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
+{
+
+#if 0
+ /*
+ * The mbuf allocator only initializes the pkthdr
+ * when the mbuf is allocated with m_gethdr(). Many users
+ * (e.g. m_copy*, m_prepend) use m_get() and then
+ * smash the pkthdr as needed causing these
+ * assertions to trip. For now just disable them.
+ */
+ M_ASSERTPKTHDR(to);
+ /* Note: with MAC, this may not be a good assertion. */
+ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
+#endif
+ MBUF_CHECKSLEEP(how);
+#ifdef MAC
+ if (to->m_flags & M_PKTHDR)
+ m_tag_delete_chain(to, NULL);
+#endif
+ to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ if ((to->m_flags & M_EXT) == 0)
+ to->m_data = to->m_pktdat;
+ to->m_pkthdr = from->m_pkthdr;
+ SLIST_INIT(&to->m_pkthdr.tags);
+ return (m_tag_copy_chain(to, from, MBTOM(how)));
+}
+
+/*
+ * Lesser-used path for M_PREPEND:
+ * allocate new mbuf to prepend to chain,
+ * copy junk along.
+ */
+struct mbuf *
+m_prepend(struct mbuf *m, int len, int how)
+{
+ struct mbuf *mn;
+
+ if (m->m_flags & M_PKTHDR)
+ mn = m_gethdr(how, m->m_type);
+ else
+ mn = m_get(how, m->m_type);
+ if (mn == NULL) {
+ m_freem(m);
+ return (NULL);
+ }
+ if (m->m_flags & M_PKTHDR)
+ m_move_pkthdr(mn, m);
+ mn->m_next = m;
+ m = mn;
+ if(m->m_flags & M_PKTHDR) {
+ if (len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ if (len < MLEN)
+ M_ALIGN(m, len);
+ }
+ m->m_len = len;
+ return (m);
+}
+
+/*
+ * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
+ * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
+ * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ */
+struct mbuf *
+m_copym(struct mbuf *m, int off0, int len, int wait)
+{
+ struct mbuf *n, **np;
+ int off = off0;
+ struct mbuf *top;
+ int copyhdr = 0;
+
+ KASSERT(off >= 0, ("m_copym, negative off %d", off));
+ KASSERT(len >= 0, ("m_copym, negative len %d", len));
+ MBUF_CHECKSLEEP(wait);
+ if (off == 0 && m->m_flags & M_PKTHDR)
+ copyhdr = 1;
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ np = &top;
+ top = 0;
+ while (len > 0) {
+ if (m == NULL) {
+ KASSERT(len == M_COPYALL,
+ ("m_copym, length > size of mbuf chain"));
+ break;
+ }
+ if (copyhdr)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
+ *np = n;
+ if (n == NULL)
+ goto nospace;
+ if (copyhdr) {
+ if (!m_dup_pkthdr(n, m, wait))
+ goto nospace;
+ if (len == M_COPYALL)
+ n->m_pkthdr.len -= off0;
+ else
+ n->m_pkthdr.len = len;
+ copyhdr = 0;
+ }
+ n->m_len = min(len, m->m_len - off);
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + off;
+ mb_dupcl(n, m);
+ } else
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (u_int)n->m_len);
+ if (len != M_COPYALL)
+ len -= n->m_len;
+ off = 0;
+ m = m->m_next;
+ np = &n->m_next;
+ }
+
+ return (top);
+nospace:
+ m_freem(top);
+ return (NULL);
+}
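+
+/*
+ * Usage sketch (illustrative): take a reference-counted copy of a whole
+ * packet, e.g. to keep one copy queued while another is transmitted:
+ *
+ * n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ * if (n == NULL)
+ * ... drop or retry later ...
+ *
+ * Because clusters are shared rather than copied, the copy is read-only;
+ * check M_WRITABLE() (or use m_dup()) before modifying it.
+ */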
+
+/*
+ * Returns the mbuf chain with a new head for the prepending case.
+ * Copies len bytes starting at off from mbuf (chain) n to mbuf (chain) m,
+ * either prepending or appending the data.
+ * The resulting mbuf (chain) m is fully writable.
+ * m is the destination (made writable), n is the source,
+ * off is the offset into the source, len is the length from that offset,
+ * prep selects the direction (0 to append, 1 to prepend), and
+ * how is M_WAITOK or M_NOWAIT.
+ */
+
+static int
+m_bcopyxxx(void *s, void *t, u_int len)
+{
+ bcopy(s, t, (size_t)len);
+ return 0;
+}
+
+struct mbuf *
+m_copymdata(struct mbuf *m, struct mbuf *n, int off, int len,
+ int prep, int how)
+{
+ struct mbuf *mm, *x, *z, *prev = NULL;
+ caddr_t p;
+ int i, nlen = 0;
+ caddr_t buf[MLEN];
+
+ KASSERT(m != NULL && n != NULL, ("m_copymdata, no target or source"));
+ KASSERT(off >= 0, ("m_copymdata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copymdata, negative len %d", len));
+ KASSERT(prep == 0 || prep == 1, ("m_copymdata, unknown direction %d", prep));
+
+ mm = m;
+ if (!prep) {
+ while(mm->m_next) {
+ prev = mm;
+ mm = mm->m_next;
+ }
+ }
+ for (z = n; z != NULL; z = z->m_next)
+ nlen += z->m_len;
+ if (len == M_COPYALL)
+ len = nlen - off;
+ if (off + len > nlen || len < 1)
+ return NULL;
+
+ if (!M_WRITABLE(mm)) {
+ /* XXX: Use proper m_xxx function instead. */
+ x = m_getcl(how, MT_DATA, mm->m_flags);
+ if (x == NULL)
+ return NULL;
+ bcopy(mm->m_ext.ext_buf, x->m_ext.ext_buf, x->m_ext.ext_size);
+ p = x->m_ext.ext_buf + (mm->m_data - mm->m_ext.ext_buf);
+ x->m_data = p;
+ mm->m_next = NULL;
+ if (mm != m)
+ prev->m_next = x;
+ m_free(mm);
+ mm = x;
+ }
+
+ /*
+ * Append/prepend the data, allocating mbufs as necessary.
+ */
+ /* Shortcut if enough free space in first/last mbuf. */
+ if (!prep && M_TRAILINGSPACE(mm) >= len) {
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t) +
+ mm->m_len);
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return m;
+ }
+ if (prep && M_LEADINGSPACE(mm) >= len) {
+ mm->m_data = mtod(mm, caddr_t) - len;
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t));
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return mm;
+ }
+
+ /* Expand first/last mbuf to cluster if possible. */
+ if (!prep && !(mm->m_flags & M_EXT) && len > M_TRAILINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
+ mm->m_data = mm->m_ext.ext_buf;
+ }
+ if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, (caddr_t *)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len, mm->m_len);
+ mm->m_data = (caddr_t)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len;
+ }
+
+ /* Append/prepend as many mbuf (clusters) as necessary to fit len. */
+ if (!prep && len > M_TRAILINGSPACE(mm)) {
+ if (!m_getm(mm, len - M_TRAILINGSPACE(mm), how, MT_DATA))
+ return NULL;
+ }
+ if (prep && len > M_LEADINGSPACE(mm)) {
+ if (!(z = m_getm(NULL, len - M_LEADINGSPACE(mm), how, MT_DATA)))
+ return NULL;
+ i = 0;
+ for (x = z; x != NULL; x = x->m_next) {
+ i += x->m_flags & M_EXT ? x->m_ext.ext_size :
+ (x->m_flags & M_PKTHDR ? MHLEN : MLEN);
+ if (!x->m_next)
+ break;
+ }
+ z->m_data += i - len;
+ m_move_pkthdr(mm, z);
+ x->m_next = mm;
+ mm = z;
+ }
+
+ /* Seek to start position in source mbuf. Optimization for long chains. */
+ while (off > 0) {
+ if (off < n->m_len)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+
+ /* Copy data into target mbuf. */
+ z = mm;
+ while (len > 0) {
+ KASSERT(z != NULL, ("m_copymdata, falling off target edge"));
+ i = M_TRAILINGSPACE(z);
+ m_apply(n, off, i, m_bcopyxxx, mtod(z, caddr_t) + z->m_len);
+ z->m_len += i;
+ /* fixup pkthdr.len if necessary */
+ if ((prep ? mm : m)->m_flags & M_PKTHDR)
+ (prep ? mm : m)->m_pkthdr.len += i;
+ off += i;
+ len -= i;
+ z = z->m_next;
+ }
+ return (prep ? mm : m);
+}
+
+/*
+ * Copy an entire packet, including header (which must be present).
+ * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ * Preserve alignment of the first mbuf so if the creator has left
+ * some room at the beginning (e.g. for inserting protocol headers)
+ * the copies still have the room available.
+ */
+struct mbuf *
+m_copypacket(struct mbuf *m, int how)
+{
+ struct mbuf *top, *n, *o;
+
+ MBUF_CHECKSLEEP(how);
+ n = m_get(how, m->m_type);
+ top = n;
+ if (n == NULL)
+ goto nospace;
+
+ if (!m_dup_pkthdr(n, m, how))
+ goto nospace;
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mb_dupcl(n, m);
+ } else {
+ n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ while (m) {
+ o = m_get(how, m->m_type);
+ if (o == NULL)
+ goto nospace;
+
+ n->m_next = o;
+ n = n->m_next;
+
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mb_dupcl(n, m);
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ }
+ return top;
+nospace:
+ m_freem(top);
+ return (NULL);
+}
+
+/*
+ * Copy data from an mbuf chain starting "off" bytes from the beginning,
+ * continuing for "len" bytes, into the indicated buffer.
+ */
+void
+m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ u_int count;
+
+ KASSERT(off >= 0, ("m_copydata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copydata, negative len %d", len));
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
+ count = min(m->m_len - off, len);
+ bcopy(mtod(m, caddr_t) + off, cp, count);
+ len -= count;
+ cp += count;
+ off = 0;
+ m = m->m_next;
+ }
+}
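+
+/*
+ * Usage sketch (illustrative): copy a fixed-size header that may span
+ * several mbufs into a local variable without touching the chain,
+ * assuming a hypothetical struct foo_hdr:
+ *
+ * struct foo_hdr fh;
+ *
+ * if (m->m_pkthdr.len < sizeof(fh))
+ * ... too short, drop ...
+ * m_copydata(m, 0, sizeof(fh), (caddr_t)&fh);
+ */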
+
+/*
+ * Copy a packet header mbuf chain into a completely new chain, including
+ * copying any mbuf clusters. Use this instead of m_copypacket() when
+ * you need a writable copy of an mbuf chain.
+ */
+struct mbuf *
+m_dup(struct mbuf *m, int how)
+{
+ struct mbuf **p, *top = NULL;
+ int remain, moff, nsize;
+
+ MBUF_CHECKSLEEP(how);
+ /* Sanity check */
+ if (m == NULL)
+ return (NULL);
+ M_ASSERTPKTHDR(m);
+
+ /* While there's more data, get a new mbuf, tack it on, and fill it */
+ remain = m->m_pkthdr.len;
+ moff = 0;
+ p = &top;
+ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
+ struct mbuf *n;
+
+ /* Get the next new mbuf */
+ if (remain >= MINCLSIZE) {
+ n = m_getcl(how, m->m_type, 0);
+ nsize = MCLBYTES;
+ } else {
+ n = m_get(how, m->m_type);
+ nsize = MLEN;
+ }
+ if (n == NULL)
+ goto nospace;
+
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (!m_dup_pkthdr(n, m, how)) {
+ m_free(n);
+ goto nospace;
+ }
+ if ((n->m_flags & M_EXT) == 0)
+ nsize = MHLEN;
+ }
+ n->m_len = 0;
+
+ /* Link it into the new chain */
+ *p = n;
+ p = &n->m_next;
+
+ /* Copy data from original mbuf(s) into new mbuf */
+ while (n->m_len < nsize && m != NULL) {
+ int chunk = min(nsize - n->m_len, m->m_len - moff);
+
+ bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
+ moff += chunk;
+ n->m_len += chunk;
+ remain -= chunk;
+ if (moff == m->m_len) {
+ m = m->m_next;
+ moff = 0;
+ }
+ }
+
+ /* Check correct total mbuf length */
+ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
+ ("%s: bogus m_pkthdr.len", __func__));
+ }
+ return (top);
+
+nospace:
+ m_freem(top);
+ return (NULL);
+}
+
+/*
+ * Concatenate mbuf chain n to m.
+ * Both chains must be of the same type (e.g. MT_DATA).
+ * The m_pkthdr, if present, is not updated.
+ */
+void
+m_cat(struct mbuf *m, struct mbuf *n)
+{
+ while (m->m_next)
+ m = m->m_next;
+ while (n) {
+ if (!M_WRITABLE(m) ||
+ M_TRAILINGSPACE(m) < n->m_len) {
+ /* just join the two chains */
+ m->m_next = n;
+ return;
+ }
+ /* splat the data from one into the other */
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)n->m_len);
+ m->m_len += n->m_len;
+ n = m_free(n);
+ }
+}
+
+void
+m_adj(struct mbuf *mp, int req_len)
+{
+ int len = req_len;
+ struct mbuf *m;
+ int count;
+
+ if ((m = mp) == NULL)
+ return;
+ if (len >= 0) {
+ /*
+ * Trim from head.
+ */
+ while (m != NULL && len > 0) {
+ if (m->m_len <= len) {
+ len -= m->m_len;
+ m->m_len = 0;
+ m = m->m_next;
+ } else {
+ m->m_len -= len;
+ m->m_data += len;
+ len = 0;
+ }
+ }
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= (req_len - len);
+ } else {
+ /*
+ * Trim from tail. Scan the mbuf chain,
+ * calculating its length and finding the last mbuf.
+ * If the adjustment only affects this mbuf, then just
+ * adjust and return. Otherwise, rescan and truncate
+ * after the remaining size.
+ */
+ len = -len;
+ count = 0;
+ for (;;) {
+ count += m->m_len;
+ if (m->m_next == (struct mbuf *)0)
+ break;
+ m = m->m_next;
+ }
+ if (m->m_len >= len) {
+ m->m_len -= len;
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= len;
+ return;
+ }
+ count -= len;
+ if (count < 0)
+ count = 0;
+ /*
+ * Correct length for chain is "count".
+ * Find the mbuf with last data, adjust its length,
+ * and toss data from remaining mbufs on chain.
+ */
+ m = mp;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len = count;
+ for (; m; m = m->m_next) {
+ if (m->m_len >= count) {
+ m->m_len = count;
+ if (m->m_next != NULL) {
+ m_freem(m->m_next);
+ m->m_next = NULL;
+ }
+ break;
+ }
+ count -= m->m_len;
+ }
+ }
+}
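+
+/*
+ * Usage sketch (illustrative): m_adj() trims req_len bytes from the head
+ * of the chain when positive and from the tail when negative, e.g.
+ * stripping an Ethernet header and a trailing CRC:
+ *
+ * m_adj(m, ETHER_HDR_LEN); (14 bytes off the front)
+ * m_adj(m, -ETHER_CRC_LEN); (4 bytes off the end)
+ */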
+
+/*
+ * Rearrange an mbuf chain so that len bytes are contiguous
+ * and in the data area of an mbuf (so that mtod will work
+ * for a structure of size len). Returns the resulting
+ * mbuf chain on success, frees it and returns NULL on failure.
+ * If there is room, it will add up to max_protohdr-len extra bytes to the
+ * contiguous region in an attempt to avoid being called next time.
+ */
+struct mbuf *
+m_pullup(struct mbuf *n, int len)
+{
+ struct mbuf *m;
+ int count;
+ int space;
+
+ /*
+ * If first mbuf has no cluster, and has room for len bytes
+ * without shifting current data, pullup into it,
+ * otherwise allocate a new mbuf to prepend to the chain.
+ */
+ if ((n->m_flags & M_EXT) == 0 &&
+ n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
+ if (n->m_len >= len)
+ return (n);
+ m = n;
+ n = n->m_next;
+ len -= m->m_len;
+ } else {
+ if (len > MHLEN)
+ goto bad;
+ m = m_get(M_NOWAIT, n->m_type);
+ if (m == NULL)
+ goto bad;
+ if (n->m_flags & M_PKTHDR)
+ m_move_pkthdr(m, n);
+ }
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+bad:
+ m_freem(n);
+ return (NULL);
+}
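+
+/*
+ * Usage sketch (illustrative): the classic protocol-input pattern, making
+ * sure the first mbuf holds a contiguous struct ip before using mtod():
+ *
+ * if (m->m_len < sizeof(struct ip) &&
+ * (m = m_pullup(m, sizeof(struct ip))) == NULL)
+ * return; (m_pullup already freed the chain)
+ * ip = mtod(m, struct ip *);
+ */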
+
+/*
+ * Like m_pullup(), except a new mbuf is always allocated, and we allow
+ * the amount of empty space before the data in the new mbuf to be specified
+ * (in the event that the caller expects to prepend later).
+ */
+int MSFail;
+
+struct mbuf *
+m_copyup(struct mbuf *n, int len, int dstoff)
+{
+ struct mbuf *m;
+ int count, space;
+
+ if (len > (MHLEN - dstoff))
+ goto bad;
+ m = m_get(M_NOWAIT, n->m_type);
+ if (m == NULL)
+ goto bad;
+ if (n->m_flags & M_PKTHDR)
+ m_move_pkthdr(m, n);
+ m->m_data += dstoff;
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
+ (unsigned)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+ bad:
+ m_freem(n);
+ MSFail++;
+ return (NULL);
+}
+
+/*
+ * Partition an mbuf chain in two pieces, returning the tail --
+ * all but the first len0 bytes. In case of failure, it returns NULL and
+ * attempts to restore the chain to its original state.
+ *
+ * Note that the resulting mbufs might be read-only, because the new
+ * mbuf can end up sharing an mbuf cluster with the original mbuf if
+ * the "breaking point" happens to lie within a cluster mbuf. Use the
+ * M_WRITABLE() macro to check for this case.
+ */
+struct mbuf *
+m_split(struct mbuf *m0, int len0, int wait)
+{
+ struct mbuf *m, *n;
+ u_int len = len0, remain;
+
+ MBUF_CHECKSLEEP(wait);
+ for (m = m0; m && len > m->m_len; m = m->m_next)
+ len -= m->m_len;
+ if (m == NULL)
+ return (NULL);
+ remain = m->m_len - len;
+ if (m0->m_flags & M_PKTHDR && remain == 0) {
+ n = m_gethdr(wait, m0->m_type);
+ if (n == NULL)
+ return (NULL);
+ n->m_next = m->m_next;
+ m->m_next = NULL;
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ return (n);
+ } else if (m0->m_flags & M_PKTHDR) {
+ n = m_gethdr(wait, m0->m_type);
+ if (n == NULL)
+ return (NULL);
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ if (m->m_flags & M_EXT)
+ goto extpacket;
+ if (remain > MHLEN) {
+ /* m can't be the lead packet */
+ MH_ALIGN(n, 0);
+ n->m_next = m_split(m, len, wait);
+ if (n->m_next == NULL) {
+ (void) m_free(n);
+ return (NULL);
+ } else {
+ n->m_len = 0;
+ return (n);
+ }
+ } else
+ MH_ALIGN(n, remain);
+ } else if (remain == 0) {
+ n = m->m_next;
+ m->m_next = NULL;
+ return (n);
+ } else {
+ n = m_get(wait, m->m_type);
+ if (n == NULL)
+ return (NULL);
+ M_ALIGN(n, remain);
+ }
+extpacket:
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + len;
+ mb_dupcl(n, m);
+ } else {
+ bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
+ }
+ n->m_len = remain;
+ m->m_len = len;
+ n->m_next = m->m_next;
+ m->m_next = NULL;
+ return (n);
+}
+/*
+ * Routine to copy from device local memory into mbufs.
+ * Note that the `off' argument is the offset into the first mbuf of the
+ * target chain at which the copied data is placed.
+ */
+struct mbuf *
+m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
+ void (*copy)(char *from, caddr_t to, u_int len))
+{
+ struct mbuf *m;
+ struct mbuf *top = NULL, **mp = &top;
+ int len;
+
+ if (off < 0 || off > MHLEN)
+ return (NULL);
+
+ while (totlen > 0) {
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ len = MCLBYTES;
+ } else {
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ len = MHLEN;
+
+ /* Place initial small packet/header at end of mbuf */
+ if (m && totlen + off + max_linkhdr <= MLEN) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (m == NULL)
+ return NULL;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
+ } else {
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_NOWAIT, MT_DATA, 0);
+ len = MCLBYTES;
+ } else {
+ m = m_get(M_NOWAIT, MT_DATA);
+ len = MLEN;
+ }
+ if (m == NULL) {
+ m_freem(top);
+ return NULL;
+ }
+ }
+ if (off) {
+ m->m_data += off;
+ len -= off;
+ off = 0;
+ }
+ m->m_len = len = min(totlen, len);
+ if (copy)
+ copy(buf, mtod(m, caddr_t), (u_int)len);
+ else
+ bcopy(buf, mtod(m, caddr_t), (u_int)len);
+ buf += len;
+ *mp = m;
+ mp = &m->m_next;
+ totlen -= len;
+ }
+ return (top);
+}
+
+/*
+ * Copy data from a buffer back into the indicated mbuf chain,
+ * starting "off" bytes from the beginning, extending the mbuf
+ * chain if necessary.
+ */
+void
+m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
+{
+ int mlen;
+ struct mbuf *m = m0, *n;
+ int totlen = 0;
+
+ if (m0 == NULL)
+ return;
+ while (off > (mlen = m->m_len)) {
+ off -= mlen;
+ totlen += mlen;
+ if (m->m_next == NULL) {
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ goto out;
+ bzero(mtod(n, caddr_t), MLEN);
+ n->m_len = min(MLEN, len + off);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+ while (len > 0) {
+ if (m->m_next == NULL && (len > m->m_len - off)) {
+ m->m_len += min(len - (m->m_len - off),
+ M_TRAILINGSPACE(m));
+ }
+ mlen = min (m->m_len - off, len);
+ bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
+ cp += mlen;
+ len -= mlen;
+ mlen += off;
+ off = 0;
+ totlen += mlen;
+ if (len == 0)
+ break;
+ if (m->m_next == NULL) {
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ break;
+ n->m_len = min(MLEN, len);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
+ m->m_pkthdr.len = totlen;
+}
+
+/*
+ * Append the specified data to the indicated mbuf chain,
+ * extending the mbuf chain if the new data does not fit in
+ * existing space.
+ *
+ * Return 1 if able to complete the job; otherwise 0.
+ */
+int
+m_append(struct mbuf *m0, int len, c_caddr_t cp)
+{
+ struct mbuf *m, *n;
+ int remainder, space;
+
+ for (m = m0; m->m_next != NULL; m = m->m_next)
+ ;
+ remainder = len;
+ space = M_TRAILINGSPACE(m);
+ if (space > 0) {
+ /*
+ * Copy into available space.
+ */
+ if (space > remainder)
+ space = remainder;
+ bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
+ m->m_len += space;
+ cp += space, remainder -= space;
+ }
+ while (remainder > 0) {
+ /*
+ * Allocate a new mbuf; could check space
+ * and allocate a cluster instead.
+ */
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ break;
+ n->m_len = min(MLEN, remainder);
+ bcopy(cp, mtod(n, caddr_t), n->m_len);
+ cp += n->m_len, remainder -= n->m_len;
+ m->m_next = n;
+ m = n;
+ }
+ if (m0->m_flags & M_PKTHDR)
+ m0->m_pkthdr.len += len - remainder;
+ return (remainder == 0);
+}
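+
+/*
+ * Usage sketch (illustrative): append a small trailer structure
+ * (hypothetical) to an existing packet, letting m_append() grow the
+ * chain if the last mbuf has no room left:
+ *
+ * if (m_append(m, sizeof(trailer), (c_caddr_t)&trailer) == 0)
+ * ... allocation failed, drop the packet ...
+ */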
+
+/*
+ * Apply function f to the data in an mbuf chain starting "off" bytes from
+ * the beginning, continuing for "len" bytes.
+ */
+int
+m_apply(struct mbuf *m, int off, int len,
+ int (*f)(void *, void *, u_int), void *arg)
+{
+ u_int count;
+ int rval;
+
+ KASSERT(off >= 0, ("m_apply, negative off %d", off));
+ KASSERT(len >= 0, ("m_apply, negative len %d", len));
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
+ count = min(m->m_len - off, len);
+ rval = (*f)(arg, mtod(m, caddr_t) + off, count);
+ if (rval)
+ return (rval);
+ len -= count;
+ off = 0;
+ m = m->m_next;
+ }
+ return (0);
+}
+
+/*
+ * Return a pointer to mbuf/offset of location in mbuf chain.
+ */
+struct mbuf *
+m_getptr(struct mbuf *m, int loc, int *off)
+{
+
+ while (loc >= 0) {
+ /* Normal end of search. */
+ if (m->m_len > loc) {
+ *off = loc;
+ return (m);
+ } else {
+ loc -= m->m_len;
+ if (m->m_next == NULL) {
+ if (loc == 0) {
+ /* Point at the end of valid data. */
+ *off = m->m_len;
+ return (m);
+ }
+ return (NULL);
+ }
+ m = m->m_next;
+ }
+ }
+ return (NULL);
+}
+
+void
+m_print(const struct mbuf *m, int maxlen)
+{
+ int len;
+ int pdata;
+ const struct mbuf *m2;
+
+ if (m == NULL) {
+ printf("mbuf: %p\n", m);
+ return;
+ }
+
+ if (m->m_flags & M_PKTHDR)
+ len = m->m_pkthdr.len;
+ else
+ len = -1;
+ m2 = m;
+ while (m2 != NULL && (len == -1 || len)) {
+ pdata = m2->m_len;
+ if (maxlen != -1 && pdata > maxlen)
+ pdata = maxlen;
+ printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
+ m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
+ "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
+ "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
+ if (pdata)
+ printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
+ if (len != -1)
+ len -= m2->m_len;
+ m2 = m2->m_next;
+ }
+ if (len > 0)
+ printf("%d bytes unaccounted for.\n", len);
+ return;
+}
+
+u_int
+m_fixhdr(struct mbuf *m0)
+{
+ u_int len;
+
+ len = m_length(m0, NULL);
+ m0->m_pkthdr.len = len;
+ return (len);
+}
+
+u_int
+m_length(struct mbuf *m0, struct mbuf **last)
+{
+ struct mbuf *m;
+ u_int len;
+
+ len = 0;
+ for (m = m0; m != NULL; m = m->m_next) {
+ len += m->m_len;
+ if (m->m_next == NULL)
+ break;
+ }
+ if (last != NULL)
+ *last = m;
+ return (len);
+}
+
+/*
+ * Defragment an mbuf chain, returning the shortest possible
+ * chain of mbufs and clusters. If allocation fails and
+ * this cannot be completed, NULL will be returned, but
+ * the passed in chain will be unchanged. Upon success,
+ * the original chain will be freed, and the new chain
+ * will be returned.
+ *
+ * If a non-packet header is passed in, the original
+ * mbuf (chain?) will be returned unharmed.
+ */
+struct mbuf *
+m_defrag(struct mbuf *m0, int how)
+{
+ struct mbuf *m_new = NULL, *m_final = NULL;
+ int progress = 0, length;
+
+ MBUF_CHECKSLEEP(how);
+ if (!(m0->m_flags & M_PKTHDR))
+ return (m0);
+
+ m_fixhdr(m0); /* Needed sanity check */
+
+#ifdef MBUF_STRESS_TEST
+ if (m_defragrandomfailures) {
+ int temp = arc4random() & 0xff;
+ if (temp == 0xba)
+ goto nospace;
+ }
+#endif
+
+ if (m0->m_pkthdr.len > MHLEN)
+ m_final = m_getcl(how, MT_DATA, M_PKTHDR);
+ else
+ m_final = m_gethdr(how, MT_DATA);
+
+ if (m_final == NULL)
+ goto nospace;
+
+ if (m_dup_pkthdr(m_final, m0, how) == 0)
+ goto nospace;
+
+ m_new = m_final;
+
+ while (progress < m0->m_pkthdr.len) {
+ length = m0->m_pkthdr.len - progress;
+ if (length > MCLBYTES)
+ length = MCLBYTES;
+
+ if (m_new == NULL) {
+ if (length > MLEN)
+ m_new = m_getcl(how, MT_DATA, 0);
+ else
+ m_new = m_get(how, MT_DATA);
+ if (m_new == NULL)
+ goto nospace;
+ }
+
+ m_copydata(m0, progress, length, mtod(m_new, caddr_t));
+ progress += length;
+ m_new->m_len = length;
+ if (m_new != m_final)
+ m_cat(m_final, m_new);
+ m_new = NULL;
+ }
+#ifdef MBUF_STRESS_TEST
+ if (m0->m_next == NULL)
+ m_defraguseless++;
+#endif
+ m_freem(m0);
+ m0 = m_final;
+#ifdef MBUF_STRESS_TEST
+ m_defragpackets++;
+ m_defragbytes += m0->m_pkthdr.len;
+#endif
+ return (m0);
+nospace:
+#ifdef MBUF_STRESS_TEST
+ m_defragfailure++;
+#endif
+ if (m_final)
+ m_freem(m_final);
+ return (NULL);
+}
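+
+/*
+ * Usage sketch (illustrative): a transmit path that exceeded its DMA
+ * segment limit can linearize the chain and retry; on failure the
+ * original chain is untouched, as noted above:
+ *
+ * n = m_defrag(m, M_NOWAIT);
+ * if (n == NULL)
+ * ... drop, m is still valid ...
+ * else
+ * m = n;
+ */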
+
+/*
+ * Defragment an mbuf chain, returning at most maxfrags separate
+ * mbufs+clusters. If this is not possible NULL is returned and
+ * the original mbuf chain is left in its present (potentially
+ * modified) state. We use two techniques: collapsing consecutive
+ * mbufs and replacing consecutive mbufs by a cluster.
+ *
+ * NB: this should really be named m_defrag but that name is taken
+ */
+struct mbuf *
+m_collapse(struct mbuf *m0, int how, int maxfrags)
+{
+ struct mbuf *m, *n, *n2, **prev;
+ u_int curfrags;
+
+ /*
+ * Calculate the current number of frags.
+ */
+ curfrags = 0;
+ for (m = m0; m != NULL; m = m->m_next)
+ curfrags++;
+ /*
+ * First, try to collapse mbufs. Note that we always collapse
+ * towards the front so we don't need to deal with moving the
+ * pkthdr. This may be suboptimal if the first mbuf has much
+ * less data than the following.
+ */
+ m = m0;
+again:
+ for (;;) {
+ n = m->m_next;
+ if (n == NULL)
+ break;
+ if (M_WRITABLE(m) &&
+ n->m_len < M_TRAILINGSPACE(m)) {
+ bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
+ n->m_len);
+ m->m_len += n->m_len;
+ m->m_next = n->m_next;
+ m_free(n);
+ if (--curfrags <= maxfrags)
+ return m0;
+ } else
+ m = n;
+ }
+ KASSERT(maxfrags > 1,
+ ("maxfrags %u, but normal collapse failed", maxfrags));
+ /*
+ * Collapse consecutive mbufs to a cluster.
+ */
+ prev = &m0->m_next; /* NB: not the first mbuf */
+ while ((n = *prev) != NULL) {
+ if ((n2 = n->m_next) != NULL &&
+ n->m_len + n2->m_len < MCLBYTES) {
+ m = m_getcl(how, MT_DATA, 0);
+ if (m == NULL)
+ goto bad;
+ bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
+ bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
+ n2->m_len);
+ m->m_len = n->m_len + n2->m_len;
+ m->m_next = n2->m_next;
+ *prev = m;
+ m_free(n);
+ m_free(n2);
+ if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
+ return m0;
+ /*
+ * Still not there, try the normal collapse
+ * again before we allocate another cluster.
+ */
+ goto again;
+ }
+ prev = &n->m_next;
+ }
+ /*
+ * No place where we can collapse to a cluster; punt.
+ * This can occur if, for example, you request 2 frags
+ * but the packet requires that both be clusters (we
+ * never reallocate the first mbuf to avoid moving the
+ * packet header).
+ */
+bad:
+ return NULL;
+}
+
+#ifdef MBUF_STRESS_TEST
+
+/*
+ * Fragment an mbuf chain. There's no reason you'd ever want to do
+ * this in normal usage, but it's great for stress testing various
+ * mbuf consumers.
+ *
+ * If fragmentation is not possible, the original chain will be
+ * returned.
+ *
+ * Possible length values:
+ * 0 no fragmentation will occur
+ * > 0 each fragment will be of the specified length
+ * -1 each fragment will be the same random value in length
+ * -2 each fragment's length will be entirely random
+ * (Random values range from 1 to 256)
+ */
+struct mbuf *
+m_fragment(struct mbuf *m0, int how, int length)
+{
+ struct mbuf *m_new = NULL, *m_final = NULL;
+ int progress = 0;
+
+ if (!(m0->m_flags & M_PKTHDR))
+ return (m0);
+
+ if ((length == 0) || (length < -2))
+ return (m0);
+
+ m_fixhdr(m0); /* Needed sanity check */
+
+ m_final = m_getcl(how, MT_DATA, M_PKTHDR);
+
+ if (m_final == NULL)
+ goto nospace;
+
+ if (m_dup_pkthdr(m_final, m0, how) == 0)
+ goto nospace;
+
+ m_new = m_final;
+
+ if (length == -1)
+ length = 1 + (arc4random() & 255);
+
+ while (progress < m0->m_pkthdr.len) {
+ int fraglen;
+
+ if (length > 0)
+ fraglen = length;
+ else
+ fraglen = 1 + (arc4random() & 255);
+ if (fraglen > m0->m_pkthdr.len - progress)
+ fraglen = m0->m_pkthdr.len - progress;
+
+ if (fraglen > MCLBYTES)
+ fraglen = MCLBYTES;
+
+ if (m_new == NULL) {
+ m_new = m_getcl(how, MT_DATA, 0);
+ if (m_new == NULL)
+ goto nospace;
+ }
+
+ m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
+ progress += fraglen;
+ m_new->m_len = fraglen;
+ if (m_new != m_final)
+ m_cat(m_final, m_new);
+ m_new = NULL;
+ }
+ m_freem(m0);
+ m0 = m_final;
+ return (m0);
+nospace:
+ if (m_final)
+ m_freem(m_final);
+ /* Return the original chain on failure */
+ return (m0);
+}
+
+#endif
+
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
+struct mbuf *
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
+{
+ struct mbuf *m, *mb;
+ int error, length;
+ ssize_t total;
+ int progress = 0;
+
+ /*
+ * len can be zero or an arbitrary large value bound by
+ * the total data supplied by the uio.
+ */
+ if (len > 0)
+ total = min(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ /*
+ * The smallest unit returned by m_getm2() is a single mbuf
+ * with pkthdr. We can't align past it.
+ */
+ if (align >= MHLEN)
+ return (NULL);
+
+ /*
+ * Give us the full allocation or nothing.
+ * If len is zero return the smallest empty mbuf.
+ */
+ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
+ if (m == NULL)
+ return (NULL);
+ m->m_data += align;
+
+ /* Fill all mbufs with uio data and update header information. */
+ for (mb = m; mb != NULL; mb = mb->m_next) {
+ length = min(M_TRAILINGSPACE(mb), total - progress);
+
+ error = uiomove(mtod(mb, void *), length, uio);
+ if (error) {
+ m_freem(m);
+ return (NULL);
+ }
+
+ mb->m_len = length;
+ progress += length;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
+ }
+ KASSERT(progress == total, ("%s: progress != total", __func__));
+
+ return (m);
+}
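+
+/*
+ * Usage sketch (illustrative): copy up to "len" bytes of user data from a
+ * uio into a new packet, reserving room for a link-layer header:
+ *
+ * m = m_uiotombuf(uio, M_WAITOK, len, max_linkhdr, M_PKTHDR);
+ * if (m == NULL)
+ * ... handle failure (e.g. fault while copying in) ...
+ */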
+
+/*
+ * Copy an mbuf chain into a uio limited by len if set.
+ */
+int
+m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
+{
+ int error, length, total;
+ int progress = 0;
+
+ if (len > 0)
+ total = min(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ /* Fill the uio with data from the mbufs. */
+ for (; m != NULL; m = m->m_next) {
+ length = min(m->m_len, total - progress);
+
+ error = uiomove(mtod(m, void *), length, uio);
+ if (error)
+ return (error);
+
+ progress += length;
+ }
+
+ return (0);
+}
+
+/*
+ * Set the m_data pointer of a newly-allocated mbuf
+ * to place an object of the specified size at the
+ * end of the mbuf, longword aligned.
+ */
+void
+m_align(struct mbuf *m, int len)
+{
+#ifdef INVARIANTS
+ const char *msg = "%s: not a virgin mbuf";
+#endif
+ int adjust;
+
+ if (m->m_flags & M_EXT) {
+ KASSERT(m->m_data == m->m_ext.ext_buf, (msg, __func__));
+ adjust = m->m_ext.ext_size - len;
+ } else if (m->m_flags & M_PKTHDR) {
+ KASSERT(m->m_data == m->m_pktdat, (msg, __func__));
+ adjust = MHLEN - len;
+ } else {
+ KASSERT(m->m_data == m->m_dat, (msg, __func__));
+ adjust = MLEN - len;
+ }
+
+ m->m_data += adjust &~ (sizeof(long)-1);
+}
+
+/*
+ * Create a writable copy of the mbuf chain. While doing this
+ * we compact the chain with a goal of producing a chain with
+ * at most two mbufs. The second mbuf in this chain is likely
+ * to be a cluster. The primary purpose of this work is to create
+ * a writable packet for encryption, compression, etc. The
+ * secondary goal is to linearize the data so the data can be
+ * passed to crypto hardware in the most efficient manner possible.
+ */
+struct mbuf *
+m_unshare(struct mbuf *m0, int how)
+{
+ struct mbuf *m, *mprev;
+ struct mbuf *n, *mfirst, *mlast;
+ int len, off;
+
+ mprev = NULL;
+ for (m = m0; m != NULL; m = mprev->m_next) {
+ /*
+ * Regular mbufs are ignored unless there's a cluster
+ * in front of them that we can use to coalesce. We do
+ * the latter mainly so later clusters can be coalesced
+ * also w/o having to handle them specially (i.e. convert
+ * mbuf+cluster -> cluster). This optimization is heavily
+ * influenced by the assumption that we're running over
+ * Ethernet where MCLBYTES is large enough that the max
+ * packet size will permit lots of coalescing into a
+ * single cluster. This in turn permits efficient
+ * crypto operations, especially when using hardware.
+ */
+ if ((m->m_flags & M_EXT) == 0) {
+ if (mprev && (mprev->m_flags & M_EXT) &&
+ m->m_len <= M_TRAILINGSPACE(mprev)) {
+ /* XXX: this ignores mbuf types */
+ memcpy(mtod(mprev, caddr_t) + mprev->m_len,
+ mtod(m, caddr_t), m->m_len);
+ mprev->m_len += m->m_len;
+ mprev->m_next = m->m_next; /* unlink from chain */
+ m_free(m); /* reclaim mbuf */
+#if 0
+ newipsecstat.ips_mbcoalesced++;
+#endif
+ } else {
+ mprev = m;
+ }
+ continue;
+ }
+ /*
+ * Writable mbufs are left alone (for now).
+ */
+ if (M_WRITABLE(m)) {
+ mprev = m;
+ continue;
+ }
+
+ /*
+ * Not writable, replace with a copy or coalesce with
+ * the previous mbuf if possible (since we have to copy
+ * it anyway, we try to reduce the number of mbufs and
+ * clusters so that future work is easier).
+ */
+ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
+ /* NB: we only coalesce into a cluster or larger */
+ if (mprev != NULL && (mprev->m_flags & M_EXT) &&
+ m->m_len <= M_TRAILINGSPACE(mprev)) {
+ /* XXX: this ignores mbuf types */
+ memcpy(mtod(mprev, caddr_t) + mprev->m_len,
+ mtod(m, caddr_t), m->m_len);
+ mprev->m_len += m->m_len;
+ mprev->m_next = m->m_next; /* unlink from chain */
+ m_free(m); /* reclaim mbuf */
+#if 0
+ newipsecstat.ips_clcoalesced++;
+#endif
+ continue;
+ }
+
+ /*
+ * Allocate new space to hold the copy and copy the data.
+ * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
+ * splitting them into clusters. We could just malloc a
+ * buffer and make it external but too many device drivers
+ * don't know how to break up the non-contiguous memory when
+ * doing DMA.
+ */
+ n = m_getcl(how, m->m_type, m->m_flags);
+ if (n == NULL) {
+ m_freem(m0);
+ return (NULL);
+ }
+ len = m->m_len;
+ off = 0;
+ mfirst = n;
+ mlast = NULL;
+ for (;;) {
+ int cc = min(len, MCLBYTES);
+ memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
+ n->m_len = cc;
+ if (mlast != NULL)
+ mlast->m_next = n;
+ mlast = n;
+#if 0
+ newipsecstat.ips_clcopied++;
+#endif
+
+ len -= cc;
+ if (len <= 0)
+ break;
+ off += cc;
+
+ n = m_getcl(how, m->m_type, m->m_flags);
+ if (n == NULL) {
+ m_freem(mfirst);
+ m_freem(m0);
+ return (NULL);
+ }
+ }
+ n->m_next = m->m_next;
+ if (mprev == NULL)
+ m0 = mfirst; /* new head of chain */
+ else
+ mprev->m_next = mfirst; /* replace old mbuf */
+ m_free(m); /* release old mbuf */
+ mprev = mfirst;
+ }
+ return (m0);
+}
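+
+/*
+ * Usage sketch (illustrative): a transform that must modify packet data
+ * in place (e.g. encryption) first obtains a writable, compacted chain:
+ *
+ * m = m_unshare(m, M_NOWAIT);
+ * if (m == NULL)
+ * return (ENOBUFS); (the original chain has been freed)
+ */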
+
+#ifdef MBUF_PROFILING
+
+#define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
+struct mbufprofile {
+ uintmax_t wasted[MP_BUCKETS];
+ uintmax_t used[MP_BUCKETS];
+ uintmax_t segments[MP_BUCKETS];
+} mbprof;
+
+#define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */
+#define MP_NUMLINES 6
+#define MP_NUMSPERLINE 16
+#define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */
+/* work out max space needed and add a bit of spare space too */
+#define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
+#define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
+
+char mbprofbuf[MP_BUFSIZE];
+
+void
+m_profile(struct mbuf *m)
+{
+ int segments = 0;
+ int used = 0;
+ int wasted = 0;
+
+ while (m) {
+ segments++;
+ used += m->m_len;
+ if (m->m_flags & M_EXT) {
+ wasted += MHLEN - sizeof(m->m_ext) +
+ m->m_ext.ext_size - m->m_len;
+ } else {
+ if (m->m_flags & M_PKTHDR)
+ wasted += MHLEN - m->m_len;
+ else
+ wasted += MLEN - m->m_len;
+ }
+ m = m->m_next;
+ }
+ /* be paranoid.. it helps */
+ if (segments > MP_BUCKETS - 1)
+ segments = MP_BUCKETS - 1;
+ if (used > 100000)
+ used = 100000;
+ if (wasted > 100000)
+ wasted = 100000;
+ /* store in the appropriate bucket */
+ /* don't bother locking. if it's slightly off, so what? */
+ mbprof.segments[segments]++;
+ mbprof.used[fls(used)]++;
+ mbprof.wasted[fls(wasted)]++;
+}
+
+static void
+mbprof_textify(void)
+{
+ int offset;
+ char *c;
+ uint64_t *p;
+
+
+ p = &mbprof.wasted[0];
+ c = mbprofbuf;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "wasted:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.wasted[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+ p = &mbprof.used[0];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "used:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.used[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+ p = &mbprof.segments[0];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "segments:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.segments[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %jju",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+}
+
+static int
+mbprof_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ mbprof_textify();
+ error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
+ return (error);
+}
+
+static int
+mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
+{
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (clear) {
+ bzero(&mbprof, sizeof(mbprof));
+ }
+
+ return (error);
+}
+
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
+ NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
+#endif
+
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
new file mode 100644
index 0000000..e32e2a1
--- /dev/null
+++ b/sys/kern/uipc_mbuf2.c
@@ -0,0 +1,453 @@
+/* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */
+/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */
+
+/*-
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*#define PULLDOWN_DEBUG*/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_PACKET_TAGS, MBUF_TAG_MEM_NAME,
+ "packet-attached information");
+
+/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */
+static struct mbuf *m_dup1(struct mbuf *, int, int, int);
+
+/*
+ * ensure that [off, off + len) is contiguous on the mbuf chain "m".
+ * packet chain before "off" is kept untouched.
+ * if offp == NULL, the target will start at <retval, 0> on resulting chain.
+ * if offp != NULL, the target will start at <retval, *offp> on resulting chain.
+ *
+ * on error return (NULL return value), original "m" will be freed.
+ *
+ * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf.
+ */
+struct mbuf *
+m_pulldown(struct mbuf *m, int off, int len, int *offp)
+{
+ struct mbuf *n, *o;
+ int hlen, tlen, olen;
+ int writable;
+
+ /* check invalid arguments. */
+ if (m == NULL)
+ panic("m == NULL in m_pulldown()");
+ if (len > MCLBYTES) {
+ m_freem(m);
+ return NULL; /* impossible */
+ }
+
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("before:");
+ for (t = m; t; t = t->m_next)
+ printf(" %d", t->m_len);
+ printf("\n");
+ }
+#endif
+ n = m;
+ while (n != NULL && off > 0) {
+ if (n->m_len > off)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+	/* be sure to point to a non-empty mbuf */
+ while (n != NULL && n->m_len == 0)
+ n = n->m_next;
+ if (!n) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * XXX: This code is flawed because it considers a "writable" mbuf
+ * data region to require all of the following:
+ * (i) mbuf _has_ to have M_EXT set; if it is just a regular
+ * mbuf, it is still not considered "writable."
+ * (ii) since mbuf has M_EXT, the ext_type _has_ to be
+ * EXT_CLUSTER. Anything else makes it non-writable.
+ * (iii) M_WRITABLE() must evaluate true.
+ * Ideally, the requirement should only be (iii).
+ *
+ * If we're writable, we're sure we're writable, because the ref. count
+	 * cannot increase from 1, as that would require possession of mbuf
+	 * n by someone else (which is impossible). However, if we're _not_
+	 * writable, we may eventually become writable (if the ref. count drops
+ * to 1), but we'll fail to notice it unless we re-evaluate
+ * M_WRITABLE(). For now, we only evaluate once at the beginning and
+ * live with this.
+ */
+ /*
+ * XXX: This is dumb. If we're just a regular mbuf with no M_EXT,
+ * then we're not "writable," according to this code.
+ */
+ writable = 0;
+ if ((n->m_flags & M_EXT) == 0 ||
+ (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n)))
+ writable = 1;
+
+ /*
+ * the target data is on <n, off>.
+ * if we got enough data on the mbuf "n", we're done.
+ */
+ if ((off == 0 || offp) && len <= n->m_len - off && writable)
+ goto ok;
+
+ /*
+ * when len <= n->m_len - off and off != 0, it is a special case.
+ * len bytes from <n, off> sits in single mbuf, but the caller does
+ * not like the starting position (off).
+ * chop the current mbuf into two pieces, set off to 0.
+ */
+ if (len <= n->m_len - off) {
+ o = m_dup1(n, off, n->m_len - off, M_NOWAIT);
+ if (o == NULL) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ n->m_len = off;
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+ * we need to take hlen from <n, off> and tlen from <n->m_next, 0>,
+ * and construct contiguous mbuf with m_len == len.
+ * note that hlen + tlen == len, and tlen > 0.
+ */
+ hlen = n->m_len - off;
+ tlen = len - hlen;
+
+ /*
+ * ensure that we have enough trailing data on mbuf chain.
+ * if not, we can do nothing about the chain.
+ */
+ olen = 0;
+ for (o = n->m_next; o != NULL; o = o->m_next)
+ olen += o->m_len;
+ if (hlen + olen < len) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * easy cases first.
+ * we need to use m_copydata() to get data from <n->m_next, 0>.
+ */
+ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen
+ && writable) {
+ m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len);
+ n->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ goto ok;
+ }
+ if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen
+ && writable) {
+ n->m_next->m_data -= hlen;
+ n->m_next->m_len += hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen);
+ n->m_len -= hlen;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+	 * now, we need to do it the hard way.  don't m_copy as there's no
+	 * room on either end.
+ */
+ if (len > MLEN)
+ o = m_getcl(M_NOWAIT, m->m_type, 0);
+ else
+ o = m_get(M_NOWAIT, m->m_type);
+ if (!o) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ /* get hlen from <n, off> into <o, 0> */
+ o->m_len = hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen);
+ n->m_len -= hlen;
+ /* get tlen from <n->m_next, 0> into <o, hlen> */
+ m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len);
+ o->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = o;
+ off = 0;
+
+ok:
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("after:");
+ for (t = m; t; t = t->m_next)
+ printf("%c%d", t == n ? '*' : ' ', t->m_len);
+ printf(" (off=%d)\n", off);
+ }
+#endif
+ if (offp)
+ *offp = off;
+ return n;
+}
+
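+/*
+ * Illustrative sketch (not part of the original code): a typical caller
+ * uses m_pulldown() to make a protocol header contiguous before casting.
+ * "struct foo_hdr", "off" and the error handling below are hypothetical
+ * placeholders.
+ *
+ *	struct foo_hdr *fh;
+ *	struct mbuf *n;
+ *	int newoff;
+ *
+ *	n = m_pulldown(m, off, sizeof(*fh), &newoff);
+ *	if (n == NULL)
+ *		return (ENOBUFS);	(the whole chain "m" was already freed)
+ *	fh = (struct foo_hdr *)(mtod(n, caddr_t) + newoff);
+ */
+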
+static struct mbuf *
+m_dup1(struct mbuf *m, int off, int len, int wait)
+{
+ struct mbuf *n;
+ int copyhdr;
+
+ if (len > MCLBYTES)
+ return NULL;
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
+ copyhdr = 1;
+ else
+ copyhdr = 0;
+ if (len >= MINCLSIZE) {
+ if (copyhdr == 1)
+ n = m_getcl(wait, m->m_type, M_PKTHDR);
+ else
+ n = m_getcl(wait, m->m_type, 0);
+ } else {
+ if (copyhdr == 1)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
+ }
+ if (!n)
+ return NULL; /* ENOBUFS */
+
+ if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
+ m_free(n);
+ return NULL;
+ }
+ m_copydata(m, off, len, mtod(n, caddr_t));
+ n->m_len = len;
+ return n;
+}
+
+/* Free a packet tag. */
+void
+m_tag_free_default(struct m_tag *t)
+{
+#ifdef MAC
+ if (t->m_tag_id == PACKET_TAG_MACLABEL)
+ mac_mbuf_tag_destroy(t);
+#endif
+ free(t, M_PACKET_TAGS);
+}
+
+/* Get a packet tag structure along with specified data following. */
+struct m_tag *
+m_tag_alloc(uint32_t cookie, int type, int len, int wait)
+{
+ struct m_tag *t;
+
+ MBUF_CHECKSLEEP(wait);
+ if (len < 0)
+ return NULL;
+ t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait);
+ if (t == NULL)
+ return NULL;
+ m_tag_setup(t, cookie, type, len);
+ t->m_tag_free = m_tag_free_default;
+ return t;
+}
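+
+/*
+ * Illustrative sketch (not part of the original code): allocating a tag,
+ * attaching it to an mbuf and looking it up later.  MTAG_EXAMPLE,
+ * EXAMPLE_TYPE and struct example_data are hypothetical; m_tag_prepend()
+ * is the usual inline from sys/mbuf.h for linking a tag into
+ * m->m_pkthdr.tags.
+ *
+ *	struct example_data ed;
+ *	struct m_tag *mtag;
+ *
+ *	mtag = m_tag_alloc(MTAG_EXAMPLE, EXAMPLE_TYPE,
+ *	    sizeof(struct example_data), M_NOWAIT);
+ *	if (mtag == NULL)
+ *		return (ENOMEM);
+ *	bcopy(&ed, mtag + 1, sizeof(struct example_data));
+ *	m_tag_prepend(m, mtag);
+ *
+ *	Later, another code path can retrieve it:
+ *
+ *	mtag = m_tag_locate(m, MTAG_EXAMPLE, EXAMPLE_TYPE, NULL);
+ *	if (mtag != NULL)
+ *		ed = *(struct example_data *)(mtag + 1);
+ */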
+
+/* Unlink and free a packet tag. */
+void
+m_tag_delete(struct mbuf *m, struct m_tag *t)
+{
+
+ KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t));
+ m_tag_unlink(m, t);
+ m_tag_free(t);
+}
+
+/* Unlink and free a packet tag chain, starting from given tag. */
+void
+m_tag_delete_chain(struct mbuf *m, struct m_tag *t)
+{
+ struct m_tag *p, *q;
+
+ KASSERT(m, ("m_tag_delete_chain: null mbuf"));
+ if (t != NULL)
+ p = t;
+ else
+ p = SLIST_FIRST(&m->m_pkthdr.tags);
+ if (p == NULL)
+ return;
+ while ((q = SLIST_NEXT(p, m_tag_link)) != NULL)
+ m_tag_delete(m, q);
+ m_tag_delete(m, p);
+}
+
+/*
+ * Strip off all tags that would normally vanish when
+ * passing through a network interface. Only persistent
+ * tags will exist after this; these are expected to remain
+ * so long as the mbuf chain exists, regardless of the
+ * path the mbufs take.
+ */
+void
+m_tag_delete_nonpersistent(struct mbuf *m)
+{
+ struct m_tag *p, *q;
+
+ SLIST_FOREACH_SAFE(p, &m->m_pkthdr.tags, m_tag_link, q)
+ if ((p->m_tag_id & MTAG_PERSISTENT) == 0)
+ m_tag_delete(m, p);
+}
+
+/* Find a tag, starting from a given position. */
+struct m_tag *
+m_tag_locate(struct mbuf *m, uint32_t cookie, int type, struct m_tag *t)
+{
+ struct m_tag *p;
+
+ KASSERT(m, ("m_tag_locate: null mbuf"));
+ if (t == NULL)
+ p = SLIST_FIRST(&m->m_pkthdr.tags);
+ else
+ p = SLIST_NEXT(t, m_tag_link);
+ while (p != NULL) {
+ if (p->m_tag_cookie == cookie && p->m_tag_id == type)
+ return p;
+ p = SLIST_NEXT(p, m_tag_link);
+ }
+ return NULL;
+}
+
+/* Copy a single tag. */
+struct m_tag *
+m_tag_copy(struct m_tag *t, int how)
+{
+ struct m_tag *p;
+
+ MBUF_CHECKSLEEP(how);
+ KASSERT(t, ("m_tag_copy: null tag"));
+ p = m_tag_alloc(t->m_tag_cookie, t->m_tag_id, t->m_tag_len, how);
+ if (p == NULL)
+ return (NULL);
+#ifdef MAC
+ /*
+ * XXXMAC: we should probably pass off the initialization, and
+ * copying here? can we hide that PACKET_TAG_MACLABEL is
+ * special from the mbuf code?
+ */
+ if (t->m_tag_id == PACKET_TAG_MACLABEL) {
+ if (mac_mbuf_tag_init(p, how) != 0) {
+ m_tag_free(p);
+ return (NULL);
+ }
+ mac_mbuf_tag_copy(t, p);
+ } else
+#endif
+ bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */
+ return p;
+}
+
+/*
+ * Copy a packet tag chain from one mbuf to another.  The destination
+ * mbuf (to) loses any attached tags even if the operation fails.  This
+ * should not be a problem, as
+ * m_tag_copy_chain() is typically called with a newly-allocated
+ * destination mbuf.
+ */
+int
+m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int how)
+{
+ struct m_tag *p, *t, *tprev = NULL;
+
+ MBUF_CHECKSLEEP(how);
+ KASSERT(to && from,
+ ("m_tag_copy_chain: null argument, to %p from %p", to, from));
+ m_tag_delete_chain(to, NULL);
+ SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) {
+ t = m_tag_copy(p, how);
+ if (t == NULL) {
+ m_tag_delete_chain(to, NULL);
+ return 0;
+ }
+ if (tprev == NULL)
+ SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link);
+ else
+ SLIST_INSERT_AFTER(tprev, t, m_tag_link);
+ tprev = t;
+ }
+ return 1;
+}
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
new file mode 100644
index 0000000..fe7e886
--- /dev/null
+++ b/sys/kern/uipc_mqueue.c
@@ -0,0 +1,2883 @@
+/*-
+ * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * POSIX message queue implementation.
+ *
+ * 1) A mqueue filesystem can be mounted; each message queue appears
+ *    in the mounted directory, and the user can change a queue's
+ *    permissions and ownership or remove a queue.  Manually creating a
+ *    file in the directory causes a message queue to be created in the
+ *    kernel with the default message queue attributes applied and the
+ *    same name used; this method is not recommended, since the mq_open
+ *    syscall allows the user to specify different attributes.  The file
+ *    system can also be mounted multiple times at different mount
+ *    points, but shows the same contents.
+ *
+ * 2) Standard POSIX message queue API.  The syscalls do not use the
+ *    vfs layer but operate directly on internal data structures; this
+ *    allows the user to use the IPC facility without having to mount
+ *    the mqueue filesystem.
+ */
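+
+/*
+ * Illustrative userland sketch (not part of this file): the syscalls
+ * below back the standard POSIX mq_*() interface (typically linked with
+ * -lrt), roughly used as follows.  "/myqueue" and the sizes are
+ * arbitrary example values.
+ *
+ *	#include <mqueue.h>
+ *	#include <fcntl.h>
+ *
+ *	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
+ *	mqd_t mqd = mq_open("/myqueue", O_RDWR | O_CREAT, 0600, &attr);
+ *	char buf[128];
+ *
+ *	mq_send(mqd, "hello", 5, 0);
+ *	mq_receive(mqd, buf, sizeof(buf), NULL);   (buf must hold mq_msgsize)
+ *	mq_close(mqd);
+ *	mq_unlink("/myqueue");
+ */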
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/posix4.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysproto.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <machine/atomic.h>
+
+FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
+
+/*
+ * Limits and constants
+ */
+#define MQFS_NAMELEN NAME_MAX
+#define MQFS_DELEN (8 + MQFS_NAMELEN)
+
+/* node types */
+typedef enum {
+ mqfstype_none = 0,
+ mqfstype_root,
+ mqfstype_dir,
+ mqfstype_this,
+ mqfstype_parent,
+ mqfstype_file,
+ mqfstype_symlink,
+} mqfs_type_t;
+
+struct mqfs_node;
+
+/*
+ * mqfs_info: describes a mqfs instance
+ */
+struct mqfs_info {
+ struct sx mi_lock;
+ struct mqfs_node *mi_root;
+ struct unrhdr *mi_unrhdr;
+};
+
+struct mqfs_vdata {
+ LIST_ENTRY(mqfs_vdata) mv_link;
+ struct mqfs_node *mv_node;
+ struct vnode *mv_vnode;
+ struct task mv_task;
+};
+
+/*
+ * mqfs_node: describes a node (file or directory) within a mqfs
+ */
+struct mqfs_node {
+ char mn_name[MQFS_NAMELEN+1];
+ struct mqfs_info *mn_info;
+ struct mqfs_node *mn_parent;
+ LIST_HEAD(,mqfs_node) mn_children;
+ LIST_ENTRY(mqfs_node) mn_sibling;
+ LIST_HEAD(,mqfs_vdata) mn_vnodes;
+ int mn_refcount;
+ mqfs_type_t mn_type;
+ int mn_deleted;
+ uint32_t mn_fileno;
+ void *mn_data;
+ struct timespec mn_birth;
+ struct timespec mn_ctime;
+ struct timespec mn_atime;
+ struct timespec mn_mtime;
+ uid_t mn_uid;
+ gid_t mn_gid;
+ int mn_mode;
+};
+
+#define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node)
+#define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data))
+#define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data))
+#define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \
+ (fp)->f_data)->mn_data))
+
+TAILQ_HEAD(msgq, mqueue_msg);
+
+struct mqueue;
+
+struct mqueue_notifier {
+ LIST_ENTRY(mqueue_notifier) nt_link;
+ struct sigevent nt_sigev;
+ ksiginfo_t nt_ksi;
+ struct proc *nt_proc;
+};
+
+struct mqueue {
+ struct mtx mq_mutex;
+ int mq_flags;
+ long mq_maxmsg;
+ long mq_msgsize;
+ long mq_curmsgs;
+ long mq_totalbytes;
+ struct msgq mq_msgq;
+ int mq_receivers;
+ int mq_senders;
+ struct selinfo mq_rsel;
+ struct selinfo mq_wsel;
+ struct mqueue_notifier *mq_notifier;
+};
+
+#define MQ_RSEL 0x01
+#define MQ_WSEL 0x02
+
+struct mqueue_msg {
+ TAILQ_ENTRY(mqueue_msg) msg_link;
+ unsigned int msg_prio;
+ unsigned int msg_size;
+ /* following real data... */
+};
+
+static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
+ "POSIX real time message queue");
+
+static int default_maxmsg = 10;
+static int default_msgsize = 1024;
+
+static int maxmsg = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
+ &maxmsg, 0, "Default maximum messages in queue");
+static int maxmsgsize = 16384;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
+ &maxmsgsize, 0, "Default maximum message size");
+static int maxmq = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
+ &maxmq, 0, "maximum message queues");
+static int curmq = 0;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
+ &curmq, 0, "current message queue number");
+static int unloadable = 0;
+static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
+
+static eventhandler_tag exit_tag;
+
+/* Only one instance per-system */
+static struct mqfs_info mqfs_data;
+static uma_zone_t mqnode_zone;
+static uma_zone_t mqueue_zone;
+static uma_zone_t mvdata_zone;
+static uma_zone_t mqnoti_zone;
+static struct vop_vector mqfs_vnodeops;
+static struct fileops mqueueops;
+
+/*
+ * Directory structure construction and manipulation
+ */
+#ifdef notyet
+static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+#endif
+
+static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static int mqfs_destroy(struct mqfs_node *mn);
+static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
+static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
+static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
+
+/*
+ * Message queue construction and manipulation
+ */
+static struct mqueue *mqueue_alloc(const struct mq_attr *attr);
+static void mqueue_free(struct mqueue *mq);
+static int mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
+ int timo);
+static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
+ int timo);
+static void mqueue_send_notification(struct mqueue *mq);
+static void mqueue_fdclose(struct thread *td, int fd, struct file *fp);
+static void mq_proc_exit(void *arg, struct proc *p);
+
+/*
+ * kqueue filters
+ */
+static void filt_mqdetach(struct knote *kn);
+static int filt_mqread(struct knote *kn, long hint);
+static int filt_mqwrite(struct knote *kn, long hint);
+
+struct filterops mq_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_mqdetach,
+ .f_event = filt_mqread,
+};
+struct filterops mq_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_mqdetach,
+ .f_event = filt_mqwrite,
+};
+
+/*
+ * Initialize fileno bitmap
+ */
+static void
+mqfs_fileno_init(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = new_unrhdr(1, INT_MAX, NULL);
+ mi->mi_unrhdr = up;
+}
+
+/*
+ * Tear down fileno bitmap
+ */
+static void
+mqfs_fileno_uninit(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = mi->mi_unrhdr;
+ mi->mi_unrhdr = NULL;
+ delete_unrhdr(up);
+}
+
+/*
+ * Allocate a file number
+ */
+static void
+mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ /* make sure our parent has a file number */
+ if (mn->mn_parent && !mn->mn_parent->mn_fileno)
+ mqfs_fileno_alloc(mi, mn->mn_parent);
+
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
+ break;
+ case mqfstype_this:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_this node has no parent"));
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ case mqfstype_parent:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_parent node has no parent"));
+ if (mn->mn_parent == mi->mi_root) {
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ }
+ KASSERT(mn->mn_parent->mn_parent != NULL,
+ ("mqfstype_parent node has no grandparent"));
+ mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_alloc() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+/*
+ * Release a file number
+ */
+static void
+mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ free_unr(mi->mi_unrhdr, mn->mn_fileno);
+ break;
+ case mqfstype_this:
+ case mqfstype_parent:
+ /* ignore these, as they don't "own" their file number */
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_free() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+static __inline struct mqfs_node *
+mqnode_alloc(void)
+{
+ return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
+}
+
+static __inline void
+mqnode_free(struct mqfs_node *node)
+{
+ uma_zfree(mqnode_zone, node);
+}
+
+static __inline void
+mqnode_addref(struct mqfs_node *node)
+{
+ atomic_fetchadd_int(&node->mn_refcount, 1);
+}
+
+static __inline void
+mqnode_release(struct mqfs_node *node)
+{
+ struct mqfs_info *mqfs;
+ int old, exp;
+
+ mqfs = node->mn_info;
+ old = atomic_fetchadd_int(&node->mn_refcount, -1);
+ if (node->mn_type == mqfstype_dir ||
+ node->mn_type == mqfstype_root)
+ exp = 3; /* include . and .. */
+ else
+ exp = 1;
+ if (old == exp) {
+ int locked = sx_xlocked(&mqfs->mi_lock);
+ if (!locked)
+ sx_xlock(&mqfs->mi_lock);
+ mqfs_destroy(node);
+ if (!locked)
+ sx_xunlock(&mqfs->mi_lock);
+ }
+}
+
+/*
+ * Add a node to a directory
+ */
+static int
+mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
+{
+ KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
+ KASSERT(parent->mn_info != NULL,
+ ("%s(): parent has no mn_info", __func__));
+ KASSERT(parent->mn_type == mqfstype_dir ||
+ parent->mn_type == mqfstype_root,
+ ("%s(): parent is not a directory", __func__));
+
+ node->mn_info = parent->mn_info;
+ node->mn_parent = parent;
+ LIST_INIT(&node->mn_children);
+ LIST_INIT(&node->mn_vnodes);
+ LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
+ mqnode_addref(parent);
+ return (0);
+}
+
+static struct mqfs_node *
+mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
+ int nodetype)
+{
+ struct mqfs_node *node;
+
+ node = mqnode_alloc();
+ strncpy(node->mn_name, name, namelen);
+ node->mn_type = nodetype;
+ node->mn_refcount = 1;
+ vfs_timestamp(&node->mn_birth);
+ node->mn_ctime = node->mn_atime = node->mn_mtime
+ = node->mn_birth;
+ node->mn_uid = cred->cr_uid;
+ node->mn_gid = cred->cr_gid;
+ node->mn_mode = mode;
+ return (node);
+}
+
+/*
+ * Create a file
+ */
+static struct mqfs_node *
+mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Add . and .. to a directory
+ */
+static int
+mqfs_fixup_dir(struct mqfs_node *parent)
+{
+ struct mqfs_node *dir;
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = '.';
+ dir->mn_type = mqfstype_this;
+ dir->mn_refcount = 1;
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = dir->mn_name[1] = '.';
+ dir->mn_type = mqfstype_parent;
+ dir->mn_refcount = 1;
+
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ return (0);
+}
+
+#ifdef notyet
+
+/*
+ * Create a directory
+ */
+static struct mqfs_node *
+mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+
+ if (mqfs_fixup_dir(node) != 0) {
+ mqfs_destroy(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Create a symlink
+ */
+static struct mqfs_node *
+mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+#endif
+
+/*
+ * Destroy a node or a tree of nodes
+ */
+static int
+mqfs_destroy(struct mqfs_node *node)
+{
+ struct mqfs_node *parent;
+
+ KASSERT(node != NULL,
+ ("%s(): node is NULL", __func__));
+ KASSERT(node->mn_info != NULL,
+ ("%s(): node has no mn_info", __func__));
+
+ /* destroy children */
+ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
+ while (! LIST_EMPTY(&node->mn_children))
+ mqfs_destroy(LIST_FIRST(&node->mn_children));
+
+ /* unlink from parent */
+ if ((parent = node->mn_parent) != NULL) {
+ KASSERT(parent->mn_info == node->mn_info,
+ ("%s(): parent has different mn_info", __func__));
+ LIST_REMOVE(node, mn_sibling);
+ }
+
+ if (node->mn_fileno != 0)
+ mqfs_fileno_free(node->mn_info, node);
+ if (node->mn_data != NULL)
+ mqueue_free(node->mn_data);
+ mqnode_free(node);
+ return (0);
+}
+
+/*
+ * Mount a mqfs instance
+ */
+static int
+mqfs_mount(struct mount *mp)
+{
+ struct statfs *sbp;
+
+ if (mp->mnt_flag & MNT_UPDATE)
+ return (EOPNOTSUPP);
+
+ mp->mnt_data = &mqfs_data;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_LOCAL;
+ MNT_IUNLOCK(mp);
+ vfs_getnewfsid(mp);
+
+ sbp = &mp->mnt_stat;
+ vfs_mountedfrom(mp, "mqueue");
+ sbp->f_bsize = PAGE_SIZE;
+ sbp->f_iosize = PAGE_SIZE;
+ sbp->f_blocks = 1;
+ sbp->f_bfree = 0;
+ sbp->f_bavail = 0;
+ sbp->f_files = 1;
+ sbp->f_ffree = 0;
+ return (0);
+}
+
+/*
+ * Unmount a mqfs instance
+ */
+static int
+mqfs_unmount(struct mount *mp, int mntflags)
+{
+ int error;
+
+ error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0,
+ curthread);
+ return (error);
+}
+
+/*
+ * Return a root vnode
+ */
+static int
+mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+ struct mqfs_info *mqfs;
+ int ret;
+
+ mqfs = VFSTOMQFS(mp);
+ ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
+ return (ret);
+}
+
+/*
+ * Return filesystem stats
+ */
+static int
+mqfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ /* XXX update statistics */
+ return (0);
+}
+
+/*
+ * Initialize a mqfs instance
+ */
+static int
+mqfs_init(struct vfsconf *vfc)
+{
+ struct mqfs_node *root;
+ struct mqfs_info *mi;
+
+ mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mvdata_zone = uma_zcreate("mvdata",
+ sizeof(struct mqfs_vdata), NULL, NULL, NULL,
+ NULL, UMA_ALIGN_PTR, 0);
+ mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mi = &mqfs_data;
+ sx_init(&mi->mi_lock, "mqfs lock");
+	/* set up the root directory */
+ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
+ mqfstype_root);
+ root->mn_info = mi;
+ LIST_INIT(&root->mn_children);
+ LIST_INIT(&root->mn_vnodes);
+ mi->mi_root = root;
+ mqfs_fileno_init(mi);
+ mqfs_fileno_alloc(mi, root);
+ mqfs_fixup_dir(root);
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
+ EVENTHANDLER_PRI_ANY);
+ mq_fdclose = mqueue_fdclose;
+ p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
+ return (0);
+}
+
+/*
+ * Destroy a mqfs instance
+ */
+static int
+mqfs_uninit(struct vfsconf *vfc)
+{
+ struct mqfs_info *mi;
+
+ if (!unloadable)
+ return (EOPNOTSUPP);
+ EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+ mi = &mqfs_data;
+ mqfs_destroy(mi->mi_root);
+ mi->mi_root = NULL;
+ mqfs_fileno_uninit(mi);
+ sx_destroy(&mi->mi_lock);
+ uma_zdestroy(mqnode_zone);
+ uma_zdestroy(mqueue_zone);
+ uma_zdestroy(mvdata_zone);
+ uma_zdestroy(mqnoti_zone);
+ return (0);
+}
+
+/*
+ * task routine
+ */
+static void
+do_recycle(void *context, int pending __unused)
+{
+ struct vnode *vp = (struct vnode *)context;
+
+ vrecycle(vp);
+ vdrop(vp);
+}
+
+/*
+ * Allocate a vnode
+ */
+static int
+mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
+{
+ struct mqfs_vdata *vd;
+ struct mqfs_info *mqfs;
+ struct vnode *newvpp;
+ int error;
+
+ mqfs = pn->mn_info;
+ *vpp = NULL;
+ sx_xlock(&mqfs->mi_lock);
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ if (vd->mv_vnode->v_mount == mp) {
+ vhold(vd->mv_vnode);
+ break;
+ }
+ }
+
+ if (vd != NULL) {
+found:
+ *vpp = vd->mv_vnode;
+ sx_xunlock(&mqfs->mi_lock);
+ error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
+ vdrop(*vpp);
+ return (error);
+ }
+ sx_xunlock(&mqfs->mi_lock);
+
+ error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
+ if (error)
+ return (error);
+ vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
+ error = insmntque(newvpp, mp);
+ if (error != 0)
+ return (error);
+
+ sx_xlock(&mqfs->mi_lock);
+ /*
+ * Check if it has already been allocated
+ * while we were blocked.
+ */
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ if (vd->mv_vnode->v_mount == mp) {
+ vhold(vd->mv_vnode);
+ sx_xunlock(&mqfs->mi_lock);
+
+ vgone(newvpp);
+ vput(newvpp);
+ goto found;
+ }
+ }
+
+ *vpp = newvpp;
+
+ vd = uma_zalloc(mvdata_zone, M_WAITOK);
+ (*vpp)->v_data = vd;
+ vd->mv_vnode = *vpp;
+ vd->mv_node = pn;
+ TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
+ LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
+ mqnode_addref(pn);
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ (*vpp)->v_vflag = VV_ROOT;
+ /* fall through */
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ (*vpp)->v_type = VDIR;
+ break;
+ case mqfstype_file:
+ (*vpp)->v_type = VREG;
+ break;
+ case mqfstype_symlink:
+ (*vpp)->v_type = VLNK;
+ break;
+ case mqfstype_none:
+		KASSERT(0, ("mqfs_allocv called for null node\n"));
+ default:
+ panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
+ }
+ sx_xunlock(&mqfs->mi_lock);
+ return (0);
+}
+
+/*
+ * Search a directory entry
+ */
+static struct mqfs_node *
+mqfs_search(struct mqfs_node *pd, const char *name, int len)
+{
+ struct mqfs_node *pn;
+
+ sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ if (strncmp(pn->mn_name, name, len) == 0 &&
+ pn->mn_name[len] == '\0')
+ return (pn);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up a file or directory.
+ */
+static int
+mqfs_lookupx(struct vop_cachedlookup_args *ap)
+{
+ struct componentname *cnp;
+ struct vnode *dvp, **vpp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct mqfs_info *mqfs;
+ int nameiop, flags, error, namelen;
+ char *pname;
+ struct thread *td;
+
+ cnp = ap->a_cnp;
+ vpp = ap->a_vpp;
+ dvp = ap->a_dvp;
+ pname = cnp->cn_nameptr;
+ namelen = cnp->cn_namelen;
+ td = cnp->cn_thread;
+ flags = cnp->cn_flags;
+ nameiop = cnp->cn_nameiop;
+ pd = VTON(dvp);
+ pn = NULL;
+ mqfs = pd->mn_info;
+ *vpp = NULLVP;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
+ if (error)
+ return (error);
+
+ /* shortcut: check if the name is too long */
+ if (cnp->cn_namelen >= MQFS_NAMELEN)
+ return (ENOENT);
+
+ /* self */
+ if (namelen == 1 && pname[0] == '.') {
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ pn = pd;
+ *vpp = dvp;
+ VREF(dvp);
+ return (0);
+ }
+
+ /* parent */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (dvp->v_vflag & VV_ROOT)
+ return (EIO);
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ VOP_UNLOCK(dvp, 0);
+ KASSERT(pd->mn_parent, ("non-root directory has no parent"));
+ pn = pd->mn_parent;
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ return (error);
+ }
+
+ /* named node */
+ sx_xlock(&mqfs->mi_lock);
+ pn = mqfs_search(pd, pname, namelen);
+ if (pn != NULL)
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+
+ /* found */
+ if (pn != NULL) {
+ /* DELETE */
+ if (nameiop == DELETE && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error) {
+ mqnode_release(pn);
+ return (error);
+ }
+ if (*vpp == dvp) {
+ VREF(dvp);
+ *vpp = dvp;
+ mqnode_release(pn);
+ return (0);
+ }
+ }
+
+ /* allocate vnode */
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ mqnode_release(pn);
+ if (error == 0 && cnp->cn_flags & MAKEENTRY)
+ cache_enter(dvp, *vpp, cnp);
+ return (error);
+ }
+
+ /* not found */
+
+ /* will create a new entry in the directory ? */
+ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
+ && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error)
+ return (error);
+ cnp->cn_flags |= SAVENAME;
+ return (EJUSTRETURN);
+ }
+ return (ENOENT);
+}
+
+#if 0
+struct vop_lookup_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode lookup operation
+ */
+static int
+mqfs_lookup(struct vop_cachedlookup_args *ap)
+{
+ int rc;
+
+ rc = mqfs_lookupx(ap);
+ return (rc);
+}
+
+#if 0
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vnode creation operation
+ */
+static int
+mqfs_create(struct vop_create_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int error;
+
+ pd = VTON(ap->a_dvp);
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ mq = mqueue_alloc(NULL);
+ if (mq == NULL)
+ return (EAGAIN);
+ sx_xlock(&mqfs->mi_lock);
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+ pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
+ cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn == NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ error = ENOSPC;
+ } else {
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ mqnode_release(pn);
+ if (error)
+ mqfs_destroy(pn);
+ else
+ pn->mn_data = mq;
+ }
+ if (error)
+ mqueue_free(mq);
+ return (error);
+}
+
+/*
+ * Remove an entry
+ */
+static int
+do_unlink(struct mqfs_node *pn, struct ucred *ucred)
+{
+ struct mqfs_node *parent;
+ struct mqfs_vdata *vd;
+ int error = 0;
+
+ sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
+
+ if (ucred->cr_uid != pn->mn_uid &&
+ (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
+ error = EACCES;
+ else if (!pn->mn_deleted) {
+ parent = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ cache_purge(vd->mv_vnode);
+ vhold(vd->mv_vnode);
+ taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
+ }
+ mqnode_release(pn);
+ mqnode_release(parent);
+ } else
+ error = ENOENT;
+ return (error);
+}
+
+#if 0
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode removal operation
+ */
+static int
+mqfs_remove(struct vop_remove_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn;
+ int error;
+
+ if (ap->a_vp->v_type == VDIR)
+ return (EPERM);
+ pn = VTON(ap->a_vp);
+ sx_xlock(&mqfs->mi_lock);
+ error = do_unlink(pn, ap->a_cnp->cn_cred);
+ sx_xunlock(&mqfs->mi_lock);
+ return (error);
+}
+
+#if 0
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_inactive(struct vop_inactive_args *ap)
+{
+ struct mqfs_node *pn = VTON(ap->a_vp);
+
+ if (pn->mn_deleted)
+ vrecycle(ap->a_vp);
+ return (0);
+}
+
+#if 0
+struct vop_reclaim_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_reclaim(struct vop_reclaim_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn;
+ struct mqfs_vdata *vd;
+
+ vd = vp->v_data;
+ pn = vd->mv_node;
+ sx_xlock(&mqfs->mi_lock);
+ vp->v_data = NULL;
+ LIST_REMOVE(vd, mv_link);
+ uma_zfree(mvdata_zone, vd);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ return (0);
+}
+
+#if 0
+struct vop_open_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ struct file *a_fp;
+};
+#endif
+
+static int
+mqfs_open(struct vop_open_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_close_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_access_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+/*
+ * Verify permissions
+ */
+static int
+mqfs_access(struct vop_access_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr vattr;
+ int error;
+
+ error = VOP_GETATTR(vp, &vattr, ap->a_cred);
+ if (error)
+ return (error);
+ error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
+ vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
+ return (error);
+}
+
+#if 0
+struct vop_getattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get file attributes
+ */
+static int
+mqfs_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn = VTON(vp);
+ struct vattr *vap = ap->a_vap;
+ int error = 0;
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = pn->mn_mode;
+ vap->va_nlink = 1;
+ vap->va_uid = pn->mn_uid;
+ vap->va_gid = pn->mn_gid;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_fileid = pn->mn_fileno;
+ vap->va_size = 0;
+ vap->va_blocksize = PAGE_SIZE;
+ vap->va_bytes = vap->va_size = 0;
+ vap->va_atime = pn->mn_atime;
+ vap->va_mtime = pn->mn_mtime;
+ vap->va_ctime = pn->mn_ctime;
+ vap->va_birthtime = pn->mn_birth;
+ vap->va_gen = 0;
+ vap->va_flags = 0;
+ vap->va_rdev = NODEV;
+ vap->va_bytes = 0;
+ vap->va_filerev = 0;
+ return (error);
+}
+
+#if 0
+struct vop_setattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+/*
+ * Set attributes
+ */
+static int
+mqfs_setattr(struct vop_setattr_args *ap)
+{
+ struct mqfs_node *pn;
+ struct vattr *vap;
+ struct vnode *vp;
+ struct thread *td;
+ int c, error;
+ uid_t uid;
+ gid_t gid;
+
+ td = curthread;
+ vap = ap->a_vap;
+ vp = ap->a_vp;
+ if ((vap->va_type != VNON) ||
+ (vap->va_nlink != VNOVAL) ||
+ (vap->va_fsid != VNOVAL) ||
+ (vap->va_fileid != VNOVAL) ||
+ (vap->va_blocksize != VNOVAL) ||
+ (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
+ (vap->va_rdev != VNOVAL) ||
+ ((int)vap->va_bytes != VNOVAL) ||
+ (vap->va_gen != VNOVAL)) {
+ return (EINVAL);
+ }
+
+ pn = VTON(vp);
+
+ error = c = 0;
+ if (vap->va_uid == (uid_t)VNOVAL)
+ uid = pn->mn_uid;
+ else
+ uid = vap->va_uid;
+ if (vap->va_gid == (gid_t)VNOVAL)
+ gid = pn->mn_gid;
+ else
+ gid = vap->va_gid;
+
+ if (uid != pn->mn_uid || gid != pn->mn_gid) {
+ /*
+ * To modify the ownership of a file, must possess VADMIN
+ * for that file.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
+ return (error);
+
+ /*
+ * XXXRW: Why is there a privilege check here: shouldn't the
+ * check in VOP_ACCESS() be enough? Also, are the group bits
+ * below definitely right?
+ */
+ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
+ (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
+ (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
+ return (error);
+ pn->mn_uid = uid;
+ pn->mn_gid = gid;
+ c = 1;
+ }
+
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ if ((ap->a_cred->cr_uid != pn->mn_uid) &&
+ (error = priv_check(td, PRIV_MQ_ADMIN)))
+ return (error);
+ pn->mn_mode = vap->va_mode;
+ c = 1;
+ }
+
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ /* See the comment in ufs_vnops::ufs_setattr(). */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
+ ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+ (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
+ return (error);
+ if (vap->va_atime.tv_sec != VNOVAL) {
+ pn->mn_atime = vap->va_atime;
+ }
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ pn->mn_mtime = vap->va_mtime;
+ }
+ c = 1;
+ }
+ if (c) {
+ vfs_timestamp(&pn->mn_ctime);
+ }
+ return (0);
+}
+
+#if 0
+struct vop_read_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Read from a file
+ */
+static int
+mqfs_read(struct vop_read_args *ap)
+{
+ char buf[80];
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int len, error;
+
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ pn = VTON(vp);
+ mq = VTOMQ(vp);
+ snprintf(buf, sizeof(buf),
+ "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
+ mq->mq_totalbytes,
+ mq->mq_maxmsg,
+ mq->mq_curmsgs,
+ mq->mq_msgsize);
+ buf[sizeof(buf)-1] = '\0';
+ len = strlen(buf);
+ error = uiomove_frombuf(buf, len, uio);
+ return (error);
+}
+
+#if 0
+struct vop_readdir_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ u_long **a_cookies;
+};
+#endif
+
+/*
+ * Return directory entries.
+ */
+static int
+mqfs_readdir(struct vop_readdir_args *ap)
+{
+ struct vnode *vp;
+ struct mqfs_info *mi;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct dirent entry;
+ struct uio *uio;
+ int *tmp_ncookies = NULL;
+ off_t offset;
+ int error, i;
+
+ vp = ap->a_vp;
+ mi = VFSTOMQFS(vp->v_mount);
+ pd = VTON(vp);
+ uio = ap->a_uio;
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uio->uio_offset < 0)
+ return (EINVAL);
+
+ if (ap->a_ncookies != NULL) {
+ tmp_ncookies = ap->a_ncookies;
+ *ap->a_ncookies = 0;
+ ap->a_ncookies = NULL;
+ }
+
+ error = 0;
+ offset = 0;
+
+ sx_xlock(&mi->mi_lock);
+
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ entry.d_reclen = sizeof(entry);
+ if (!pn->mn_fileno)
+ mqfs_fileno_alloc(mi, pn);
+ entry.d_fileno = pn->mn_fileno;
+ for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
+ entry.d_name[i] = pn->mn_name[i];
+ entry.d_name[i] = 0;
+ entry.d_namlen = i;
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ entry.d_type = DT_DIR;
+ break;
+ case mqfstype_file:
+ entry.d_type = DT_REG;
+ break;
+ case mqfstype_symlink:
+ entry.d_type = DT_LNK;
+ break;
+ default:
+ panic("%s has unexpected node type: %d", pn->mn_name,
+ pn->mn_type);
+ }
+ if (entry.d_reclen > uio->uio_resid)
+ break;
+ if (offset >= uio->uio_offset) {
+ error = vfs_read_dirent(ap, &entry, offset);
+ if (error)
+ break;
+ }
+ offset += entry.d_reclen;
+ }
+ sx_xunlock(&mi->mi_lock);
+
+ uio->uio_offset = offset;
+
+ if (tmp_ncookies != NULL)
+ ap->a_ncookies = tmp_ncookies;
+
+ return (error);
+}
+
+#ifdef notyet
+
+#if 0
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * Create a directory.
+ */
+static int
+mqfs_mkdir(struct vop_mkdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd = VTON(ap->a_dvp);
+ struct mqfs_node *pn;
+ int error;
+
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ sx_xlock(&mqfs->mi_lock);
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+ pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
+		cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn != NULL)
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ if (pn == NULL) {
+ error = ENOSPC;
+ } else {
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ mqnode_release(pn);
+ }
+ return (error);
+}
+
+#if 0
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * Remove a directory.
+ */
+static int
+mqfs_rmdir(struct vop_rmdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn = VTON(ap->a_vp);
+ struct mqfs_node *pt;
+
+ if (pn->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+
+ sx_xlock(&mqfs->mi_lock);
+ if (pn->mn_deleted) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOENT);
+ }
+
+ pt = LIST_FIRST(&pn->mn_children);
+ pt = LIST_NEXT(pt, mn_sibling);
+ pt = LIST_NEXT(pt, mn_sibling);
+ if (pt != NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOTEMPTY);
+ }
+ pt = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
+ mqnode_release(pn);
+ mqnode_release(pt);
+ sx_xunlock(&mqfs->mi_lock);
+ cache_purge(ap->a_vp);
+ return (0);
+}
+
+#endif /* notyet */
+
+/*
+ * Allocate a message queue
+ */
+static struct mqueue *
+mqueue_alloc(const struct mq_attr *attr)
+{
+ struct mqueue *mq;
+
+ if (curmq >= maxmq)
+ return (NULL);
+ mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mq->mq_msgq);
+ if (attr != NULL) {
+ mq->mq_maxmsg = attr->mq_maxmsg;
+ mq->mq_msgsize = attr->mq_msgsize;
+ } else {
+ mq->mq_maxmsg = default_maxmsg;
+ mq->mq_msgsize = default_msgsize;
+ }
+ mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
+ knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
+ knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
+ atomic_add_int(&curmq, 1);
+ return (mq);
+}
+
+/*
+ * Destroy a message queue
+ */
+static void
+mqueue_free(struct mqueue *mq)
+{
+ struct mqueue_msg *msg;
+
+ while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
+ TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
+ free(msg, M_MQUEUEDATA);
+ }
+
+ mtx_destroy(&mq->mq_mutex);
+ seldrain(&mq->mq_rsel);
+ seldrain(&mq->mq_wsel);
+ knlist_destroy(&mq->mq_rsel.si_note);
+ knlist_destroy(&mq->mq_wsel.si_note);
+ uma_zfree(mqueue_zone, mq);
+ atomic_add_int(&curmq, -1);
+}
+
+/*
+ * Load a message from user space
+ */
+static struct mqueue_msg *
+mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
+{
+ struct mqueue_msg *msg;
+ size_t len;
+ int error;
+
+ len = sizeof(struct mqueue_msg) + msg_size;
+ msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
+ error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
+ msg_size);
+ if (error) {
+ free(msg, M_MQUEUEDATA);
+ msg = NULL;
+ } else {
+ msg->msg_size = msg_size;
+ msg->msg_prio = msg_prio;
+ }
+ return (msg);
+}
+
+/*
+ * Save a message to user space
+ */
+static int
+mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
+{
+ int error;
+
+ error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
+ msg->msg_size);
+ if (error == 0 && msg_prio != NULL)
+ error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
+ return (error);
+}
+
+/*
+ * Free a message's memory
+ */
+static __inline void
+mqueue_freemsg(struct mqueue_msg *msg)
+{
+ free(msg, M_MQUEUEDATA);
+}
+
+/*
+ * Send a message.  If waitok is false, the thread will not be
+ * blocked when the queue is full; otherwise the absolute
+ * timeout is checked.
+ */
+int
+mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_prio >= MQ_PRIO_MAX)
+ return (EINVAL);
+ if (msg_len > mq->mq_msgsize)
+ return (EMSGSIZE);
+ msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
+ if (msg == NULL)
+ return (EFAULT);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_send(mq, msg, -1);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* we allow a null timeout (wait forever) */
+ if (abs_timeout == NULL) {
+ error = _mqueue_send(mq, msg, 0);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* send it before checking time */
+ error = _mqueue_send(mq, msg, -1);
+ if (error == 0)
+ return (0);
+
+ if (error != EAGAIN)
+ goto bad;
+
+ if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
+ error = EINVAL;
+ goto bad;
+ }
+ for (;;) {
+ ts2 = *abs_timeout;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ break;
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_send(mq, msg, tvtohz(&tv));
+ if (error != ETIMEDOUT)
+ break;
+ }
+ if (error == 0)
+ return (0);
+bad:
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to send a message
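+ * (timo < 0 means do not block, timo == 0 means block indefinitely,
+ * timo > 0 is a timeout in ticks)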
+ */
+static int
+_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
+{
+ struct mqueue_msg *msg2;
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_senders++;
+ error = msleep(&mq->mq_senders, &mq->mq_mutex,
+ PCATCH, "mqsend", timo);
+ mq->mq_senders--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (mq->mq_curmsgs >= mq->mq_maxmsg) {
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+ }
+ error = 0;
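+	/*
+	 * Keep the queue sorted by descending priority, FIFO among equal
+	 * priorities: a new message is placed after all existing messages
+	 * of greater or equal priority.
+	 */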
+ if (TAILQ_EMPTY(&mq->mq_msgq)) {
+ TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
+ } else {
+ if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
+ TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
+ } else {
+ TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
+ if (msg2->msg_prio < msg->msg_prio)
+ break;
+ }
+ TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
+ }
+ }
+ mq->mq_curmsgs++;
+ mq->mq_totalbytes += msg->msg_size;
+ if (mq->mq_receivers)
+ wakeup_one(&mq->mq_receivers);
+ else if (mq->mq_notifier != NULL)
+ mqueue_send_notification(mq);
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
+ mtx_unlock(&mq->mq_mutex);
+ return (0);
+}
+
+/*
+ * Send a realtime signal to the process which registered itself
+ * successfully via mq_notify.
+ */
+static void
+mqueue_send_notification(struct mqueue *mq)
+{
+ struct mqueue_notifier *nt;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ nt = mq->mq_notifier;
+ if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
+ p = nt->nt_proc;
+ error = sigev_findtd(p, &nt->nt_sigev, &td);
+ if (error) {
+ mq->mq_notifier = NULL;
+ return;
+ }
+ if (!KSI_ONQ(&nt->nt_ksi)) {
+ ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
+ tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
+ }
+ PROC_UNLOCK(p);
+ }
+ mq->mq_notifier = NULL;
+}
+
+/*
+ * Get a message.  If waitok is false, the thread will not be
+ * blocked when there is no data in the queue; otherwise the
+ * absolute timeout is checked.
+ */
+int
+mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_len < mq->mq_msgsize)
+ return (EMSGSIZE);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* we allow a null timeout (wait forever). */
+ if (abs_timeout == NULL) {
+ error = _mqueue_recv(mq, &msg, 0);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* try to get a message before checking time */
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error == 0)
+ goto received;
+
+ if (error != EAGAIN)
+ return (error);
+
+ if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
+ error = EINVAL;
+ return (error);
+ }
+
+ for (;;) {
+ ts2 = *abs_timeout;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ return (error);
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_recv(mq, &msg, tvtohz(&tv));
+ if (error == 0)
+ break;
+ if (error != ETIMEDOUT)
+ return (error);
+ }
+
+received:
+ error = mqueue_savemsg(msg, msg_ptr, msg_prio);
+ if (error == 0) {
+ curthread->td_retval[0] = msg->msg_size;
+ curthread->td_retval[1] = 0;
+ }
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to receive a message
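+ * (timo < 0 means do not block, timo == 0 means block indefinitely,
+ * timo > 0 is a timeout in ticks)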
+ */
+static int
+_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
+{
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_receivers++;
+ error = msleep(&mq->mq_receivers, &mq->mq_mutex,
+ PCATCH, "mqrecv", timo);
+ mq->mq_receivers--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (*msg != NULL) {
+ error = 0;
+ TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
+ mq->mq_curmsgs--;
+ mq->mq_totalbytes -= (*msg)->msg_size;
+ if (mq->mq_senders)
+ wakeup_one(&mq->mq_senders);
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
+ }
+ if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq)) {
+ mqueue_send_notification(mq);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+}
+
+static __inline struct mqueue_notifier *
+notifier_alloc(void)
+{
+ return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
+}
+
+static __inline void
+notifier_free(struct mqueue_notifier *p)
+{
+ uma_zfree(mqnoti_zone, p);
+}
+
+static struct mqueue_notifier *
+notifier_search(struct proc *p, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
+ if (nt->nt_ksi.ksi_mqd == fd)
+ break;
+ }
+ return (nt);
+}
+
+static __inline void
+notifier_insert(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
+}
+
+static __inline void
+notifier_delete(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_REMOVE(nt, nt_link);
+ notifier_free(nt);
+}
+
+static void
+notifier_remove(struct proc *p, struct mqueue *mq, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ PROC_LOCK(p);
+ nt = notifier_search(p, fd);
+ if (nt != NULL) {
+ if (mq->mq_notifier == nt)
+ mq->mq_notifier = NULL;
+ sigqueue_take(&nt->nt_ksi);
+ notifier_delete(p, nt);
+ }
+ PROC_UNLOCK(p);
+}
+
+static int
+kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
+ const struct mq_attr *attr)
+{
+ char path[MQFS_NAMELEN + 1];
+ struct mqfs_node *pn;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int fd, error, len, cmode;
+
+ fdp = td->td_proc->p_fd;
+ cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
+ mq = NULL;
+ if ((flags & O_CREAT) != 0 && attr != NULL) {
+ if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
+ return (EINVAL);
+ if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
+ return (EINVAL);
+ }
+
+ error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ /*
+ * The first character of name must be a slash (/) character
+ * and the remaining characters of name cannot include any slash
+ * characters.
+ */
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error)
+ return (error);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn == NULL) {
+ if (!(flags & O_CREAT)) {
+ error = ENOENT;
+ } else {
+ mq = mqueue_alloc(attr);
+ if (mq == NULL) {
+ error = ENFILE;
+ } else {
+ pn = mqfs_create_file(mqfs_data.mi_root,
+ path + 1, len - 1, td->td_ucred,
+ cmode);
+ if (pn == NULL) {
+ error = ENOSPC;
+ mqueue_free(mq);
+ }
+ }
+ }
+
+ if (error == 0) {
+ pn->mn_data = mq;
+ }
+ } else {
+ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
+ error = EEXIST;
+ } else {
+ accmode_t accmode = 0;
+
+ if (flags & FREAD)
+ accmode |= VREAD;
+ if (flags & FWRITE)
+ accmode |= VWRITE;
+ error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
+ pn->mn_gid, accmode, td->td_ucred, NULL);
+ }
+ }
+
+ if (error) {
+ sx_xunlock(&mqfs_data.mi_lock);
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+
+ finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
+ &mqueueops);
+
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+ return (0);
+}
+
+/*
+ * Syscall to open a message queue.
+ */
+int
+sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
+{
+ struct mq_attr attr;
+ int flags, error;
+
+ if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
+ return (EINVAL);
+ flags = FFLAGS(uap->flags);
+ if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error)
+ return (error);
+ }
+ return (kern_kmq_open(td, uap->path, flags, uap->mode,
+ uap->attr != NULL ? &attr : NULL));
+}
+
+/*
+ * Syscall to unlink a message queue.
+ */
+int
+sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+{
+ char path[MQFS_NAMELEN+1];
+ struct mqfs_node *pn;
+ int error, len;
+
+ error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn != NULL)
+ error = do_unlink(pn, td->td_ucred);
+ else
+ error = ENOENT;
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
+
+/*
+ * Get the message queue associated with a file descriptor.
+ */
+static int
+_getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
+ struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = func(td, fd, rightsp, fpp);
+ if (error)
+ return (error);
+ if (&mqueueops != (*fpp)->f_ops) {
+ fdrop(*fpp, td);
+ return (EBADF);
+ }
+ pn = (*fpp)->f_data;
+ if (ppn)
+ *ppn = pn;
+ if (pmq)
+ *pmq = pn->mn_data;
+ return (0);
+}
+
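+/*
+ * Wrappers around _getmq() that supply the capability rights used by
+ * their callers: CAP_POLL_EVENT (getmq), CAP_READ (getmq_read) and
+ * CAP_WRITE (getmq_write).
+ */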
+static __inline int
+getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
+ struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_POLL_EVENT), fget,
+ fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_read(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read,
+ fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_write(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write,
+ fpp, ppn, pmq);
+}
+
+static int
+kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
+ struct mq_attr *oattr)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ u_int oflag, flag;
+ int error;
+
+ if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
+ return (EINVAL);
+ error = getmq(td, mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ oattr->mq_maxmsg = mq->mq_maxmsg;
+ oattr->mq_msgsize = mq->mq_msgsize;
+ oattr->mq_curmsgs = mq->mq_curmsgs;
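+	/*
+	 * If new attributes were supplied, update the O_NONBLOCK bit of
+	 * f_flag with a compare-and-swap loop so that concurrent updates
+	 * to the file flags are not lost.
+	 */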
+ if (attr != NULL) {
+ do {
+ oflag = flag = fp->f_flag;
+ flag &= ~O_NONBLOCK;
+ flag |= (attr->mq_flags & O_NONBLOCK);
+ } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
+ } else
+ oflag = fp->f_flag;
+ oattr->mq_flags = (O_NONBLOCK & oflag);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+{
+ struct mq_attr attr, oattr;
+ int error;
+
+ if (uap->attr != NULL) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error != 0)
+ return (error);
+ }
+ error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
+ &oattr);
+ if (error != 0)
+ return (error);
+ if (uap->oattr != NULL)
+ error = copyout(&oattr, uap->oattr, sizeof(oattr));
+ return (error);
+}
+
+int
+sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec *abs_timeout, ets;
+ int error;
+ int waitok;
+
+ error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets, sizeof(ets));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec *abs_timeout, ets;
+ int error, waitok;
+
+ error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets, sizeof(ets));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+static int
+kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
+{
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ struct filedesc *fdp;
+ struct proc *p;
+ struct mqueue *mq;
+ struct file *fp, *fp2;
+ struct mqueue_notifier *nt, *newnt = NULL;
+ int error;
+
+ if (sigev != NULL) {
+ if (sigev->sigev_notify != SIGEV_SIGNAL &&
+ sigev->sigev_notify != SIGEV_THREAD_ID &&
+ sigev->sigev_notify != SIGEV_NONE)
+ return (EINVAL);
+ if ((sigev->sigev_notify == SIGEV_SIGNAL ||
+ sigev->sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(sigev->sigev_signo))
+ return (EINVAL);
+ }
+ p = td->td_proc;
+ fdp = td->td_proc->p_fd;
+ error = getmq(td, mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
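+	/*
+	 * A new notifier may have to be allocated with M_WAITOK, which can
+	 * sleep, so the allocation is done with no locks held and the
+	 * descriptor lookup is then retried from the top.
+	 */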
+again:
+ FILEDESC_SLOCK(fdp);
+ fp2 = fget_locked(fdp, mqd);
+ if (fp2 == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ goto out;
+ }
+#ifdef CAPABILITIES
+ error = cap_check(cap_rights(fdp, mqd),
+ cap_rights_init(&rights, CAP_POLL_EVENT));
+ if (error) {
+ FILEDESC_SUNLOCK(fdp);
+ goto out;
+ }
+#endif
+ if (fp2 != fp) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ goto out;
+ }
+ mtx_lock(&mq->mq_mutex);
+ FILEDESC_SUNLOCK(fdp);
+ if (sigev != NULL) {
+ if (mq->mq_notifier != NULL) {
+ error = EBUSY;
+ } else {
+ PROC_LOCK(p);
+ nt = notifier_search(p, mqd);
+ if (nt == NULL) {
+ if (newnt == NULL) {
+ PROC_UNLOCK(p);
+ mtx_unlock(&mq->mq_mutex);
+ newnt = notifier_alloc();
+ goto again;
+ }
+ }
+
+ if (nt != NULL) {
+ sigqueue_take(&nt->nt_ksi);
+ if (newnt != NULL) {
+ notifier_free(newnt);
+ newnt = NULL;
+ }
+ } else {
+ nt = newnt;
+ newnt = NULL;
+ ksiginfo_init(&nt->nt_ksi);
+ nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ nt->nt_ksi.ksi_code = SI_MESGQ;
+ nt->nt_proc = p;
+ nt->nt_ksi.ksi_mqd = mqd;
+ notifier_insert(p, nt);
+ }
+ nt->nt_sigev = *sigev;
+ mq->mq_notifier = nt;
+ PROC_UNLOCK(p);
+ /*
+			 * If there are no receivers and the message queue
+			 * is not empty, send the notification as soon as
+			 * possible.
+ */
+ if (mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq))
+ mqueue_send_notification(mq);
+ }
+ } else {
+ notifier_remove(p, mq, mqd);
+ }
+ mtx_unlock(&mq->mq_mutex);
+
+out:
+ fdrop(fp, td);
+ if (newnt != NULL)
+ notifier_free(newnt);
+ return (error);
+}
+
+int
+sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+{
+ struct sigevent ev, *evp;
+ int error;
+
+ if (uap->sigev == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->sigev, &ev, sizeof(ev));
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
+static void
+mqueue_fdclose(struct thread *td, int fd, struct file *fp)
+{
+ struct filedesc *fdp;
+ struct mqueue *mq;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ if (fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(td->td_proc, mq, fd);
+
+		/* Wake up select/poll waiters in the same process. */
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ }
+}
+
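+/*
+ * On process exit, scan the process's open files and remove any
+ * notifiers it registered on message queue descriptors.
+ */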
+static void
+mq_proc_exit(void *arg __unused, struct proc *p)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int i;
+
+ fdp = p->p_fd;
+ FILEDESC_SLOCK(fdp);
+ for (i = 0; i < fdp->fd_nfiles; ++i) {
+ fp = fget_locked(fdp, i);
+ if (fp != NULL && fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(p, FPTOMQ(fp), i);
+ mtx_unlock(&mq->mq_mutex);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
+}
+
+static int
+mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+mqf_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ return (ENOTTY);
+}
+
+static int
+mqf_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int revents = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (mq->mq_curmsgs) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ mq->mq_flags |= MQ_RSEL;
+ selrecord(td, &mq->mq_rsel);
+ }
+ }
+ if (events & POLLOUT) {
+ if (mq->mq_curmsgs < mq->mq_maxmsg)
+ revents |= POLLOUT;
+ else {
+ mq->mq_flags |= MQ_WSEL;
+ selrecord(td, &mq->mq_wsel);
+ }
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (revents);
+}
+
+static int
+mqf_close(struct file *fp, struct thread *td)
+{
+ struct mqfs_node *pn;
+
+ fp->f_ops = &badfileops;
+ pn = fp->f_data;
+ fp->f_data = NULL;
+ sx_xlock(&mqfs_data.mi_lock);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (0);
+}
+
+static int
+mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn = fp->f_data;
+
+ bzero(st, sizeof *st);
+ sx_xlock(&mqfs_data.mi_lock);
+ st->st_atim = pn->mn_atime;
+ st->st_mtim = pn->mn_mtime;
+ st->st_ctim = pn->mn_ctime;
+ st->st_birthtim = pn->mn_birth;
+ st->st_uid = pn->mn_uid;
+ st->st_gid = pn->mn_gid;
+ st->st_mode = S_IFIFO | pn->mn_mode;
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (0);
+}
+
+static int
+mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = 0;
+ pn = fp->f_data;
+ sx_xlock(&mqfs_data.mi_lock);
+ error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
+ active_cred, NULL);
+ if (error != 0)
+ goto out;
+ pn->mn_mode = mode & ACCESSPERMS;
+out:
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+static int
+mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = 0;
+ pn = fp->f_data;
+ sx_xlock(&mqfs_data.mi_lock);
+ if (uid == (uid_t)-1)
+ uid = pn->mn_uid;
+ if (gid == (gid_t)-1)
+ gid = pn->mn_gid;
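+	/*
+	 * Changing the owner to a different uid, or the group to one the
+	 * caller is not a member of, requires the PRIV_VFS_CHOWN privilege.
+	 */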
+ if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
+ (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ pn->mn_uid = uid;
+ pn->mn_gid = gid;
+out:
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+static int
+mqf_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int error = 0;
+
+ if (kn->kn_filter == EVFILT_READ) {
+ kn->kn_fop = &mq_rfiltops;
+ knlist_add(&mq->mq_rsel.si_note, kn, 0);
+ } else if (kn->kn_filter == EVFILT_WRITE) {
+ kn->kn_fop = &mq_wfiltops;
+ knlist_add(&mq->mq_wsel.si_note, kn, 0);
+ } else
+ error = EINVAL;
+ return (error);
+}
+
+static void
+filt_mqdetach(struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ if (kn->kn_filter == EVFILT_READ)
+ knlist_remove(&mq->mq_rsel.si_note, kn, 0);
+ else if (kn->kn_filter == EVFILT_WRITE)
+ knlist_remove(&mq->mq_wsel.si_note, kn, 0);
+ else
+ panic("filt_mqdetach");
+}
+
+static int
+filt_mqread(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs != 0);
+}
+
+static int
+filt_mqwrite(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs < mq->mq_maxmsg);
+}
+
+static struct fileops mqueueops = {
+ .fo_read = mqf_read,
+ .fo_write = mqf_write,
+ .fo_truncate = mqf_truncate,
+ .fo_ioctl = mqf_ioctl,
+ .fo_poll = mqf_poll,
+ .fo_kqfilter = mqf_kqfilter,
+ .fo_stat = mqf_stat,
+ .fo_chmod = mqf_chmod,
+ .fo_chown = mqf_chown,
+ .fo_close = mqf_close,
+ .fo_sendfile = invfo_sendfile,
+};
+
+static struct vop_vector mqfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_access = mqfs_access,
+ .vop_cachedlookup = mqfs_lookup,
+ .vop_lookup = vfs_cache_lookup,
+ .vop_reclaim = mqfs_reclaim,
+ .vop_create = mqfs_create,
+ .vop_remove = mqfs_remove,
+ .vop_inactive = mqfs_inactive,
+ .vop_open = mqfs_open,
+ .vop_close = mqfs_close,
+ .vop_getattr = mqfs_getattr,
+ .vop_setattr = mqfs_setattr,
+ .vop_read = mqfs_read,
+ .vop_write = VOP_EOPNOTSUPP,
+ .vop_readdir = mqfs_readdir,
+ .vop_mkdir = VOP_EOPNOTSUPP,
+ .vop_rmdir = VOP_EOPNOTSUPP
+};
+
+static struct vfsops mqfs_vfsops = {
+ .vfs_init = mqfs_init,
+ .vfs_uninit = mqfs_uninit,
+ .vfs_mount = mqfs_mount,
+ .vfs_unmount = mqfs_unmount,
+ .vfs_root = mqfs_root,
+ .vfs_statfs = mqfs_statfs,
+};
+
+static struct vfsconf mqueuefs_vfsconf = {
+ .vfc_version = VFS_VERSION,
+ .vfc_name = "mqueuefs",
+ .vfc_vfsops = &mqfs_vfsops,
+ .vfc_typenum = -1,
+ .vfc_flags = VFCF_SYNTHETIC
+};
+
+static struct syscall_helper_data mq_syscalls[] = {
+ SYSCALL_INIT_HELPER(kmq_open),
+ SYSCALL_INIT_HELPER(kmq_setattr),
+ SYSCALL_INIT_HELPER(kmq_timedsend),
+ SYSCALL_INIT_HELPER(kmq_timedreceive),
+ SYSCALL_INIT_HELPER(kmq_notify),
+ SYSCALL_INIT_HELPER(kmq_unlink),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static void
+mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
+{
+
+ to->mq_flags = from->mq_flags;
+ to->mq_maxmsg = from->mq_maxmsg;
+ to->mq_msgsize = from->mq_msgsize;
+ to->mq_curmsgs = from->mq_curmsgs;
+}
+
+static void
+mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
+{
+
+ to->mq_flags = from->mq_flags;
+ to->mq_maxmsg = from->mq_maxmsg;
+ to->mq_msgsize = from->mq_msgsize;
+ to->mq_curmsgs = from->mq_curmsgs;
+}
+
+int
+freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
+{
+ struct mq_attr attr;
+ struct mq_attr32 attr32;
+ int flags, error;
+
+ if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
+ return (EINVAL);
+ flags = FFLAGS(uap->flags);
+ if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
+ error = copyin(uap->attr, &attr32, sizeof(attr32));
+ if (error)
+ return (error);
+ mq_attr_from32(&attr32, &attr);
+ }
+ return (kern_kmq_open(td, uap->path, flags, uap->mode,
+ uap->attr != NULL ? &attr : NULL));
+}
+
+int
+freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
+{
+ struct mq_attr attr, oattr;
+ struct mq_attr32 attr32, oattr32;
+ int error;
+
+ if (uap->attr != NULL) {
+ error = copyin(uap->attr, &attr32, sizeof(attr32));
+ if (error != 0)
+ return (error);
+ mq_attr_from32(&attr32, &attr);
+ }
+ error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
+ &oattr);
+ if (error != 0)
+ return (error);
+ if (uap->oattr != NULL) {
+ mq_attr_to32(&oattr, &oattr32);
+ error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
+ }
+ return (error);
+}
+
+int
+freebsd32_kmq_timedsend(struct thread *td,
+ struct freebsd32_kmq_timedsend_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec32 ets32;
+ struct timespec *abs_timeout, ets;
+ int error;
+ int waitok;
+
+ error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ CP(ets32, ets, tv_sec);
+ CP(ets32, ets, tv_nsec);
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+freebsd32_kmq_timedreceive(struct thread *td,
+ struct freebsd32_kmq_timedreceive_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec32 ets32;
+ struct timespec *abs_timeout, ets;
+ int error, waitok;
+
+ error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ CP(ets32, ets, tv_sec);
+ CP(ets32, ets, tv_nsec);
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
+{
+ struct sigevent ev, *evp;
+ struct sigevent32 ev32;
+ int error;
+
+ if (uap->sigev == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->sigev, &ev32, sizeof(ev32));
+ if (error != 0)
+ return (error);
+ error = convert_sigevent32(&ev32, &ev);
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
+static struct syscall_helper_data mq32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_notify),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+mqinit(void)
+{
+ int error;
+
+ error = syscall_helper_register(mq_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(mq32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+mqunload(void)
+{
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(mq32_syscalls);
+#endif
+ syscall_helper_unregister(mq_syscalls);
+ return (0);
+}
+
+static int
+mq_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ error = vfs_modevent(module, cmd, arg);
+ if (error != 0)
+ return (error);
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = mqinit();
+ if (error != 0)
+ mqunload();
+ break;
+ case MOD_UNLOAD:
+ error = mqunload();
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t mqueuefs_mod = {
+ "mqueuefs",
+ mq_modload,
+ &mqueuefs_vfsconf
+};
+DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
+MODULE_VERSION(mqueuefs, 1);
diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c
new file mode 100644
index 0000000..f641654
--- /dev/null
+++ b/sys/kern/uipc_sem.c
@@ -0,0 +1,1111 @@
+/*-
+ * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
+ * Copyright (c) 2003-2005 SPARTA, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/condvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/_semaphore.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/vnode.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
+/*
+ * TODO
+ *
+ * - Resource limits?
+ * - Replace global sem_lock with mtx_pool locks?
+ * - Add a MAC check_create() hook for creating new named semaphores.
+ */
+
+#ifndef SEM_MAX
+#define SEM_MAX 30
+#endif
+
+#ifdef SEM_DEBUG
+#define DP(x) printf x
+#else
+#define DP(x)
+#endif
+
+struct ksem_mapping {
+ char *km_path;
+ Fnv32_t km_fnv;
+ struct ksem *km_ksem;
+ LIST_ENTRY(ksem_mapping) km_link;
+};
+
+static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
+static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
+static struct sx ksem_dict_lock;
+static struct mtx ksem_count_lock;
+static struct mtx sem_lock;
+static u_long ksem_hash;
+static int ksem_dead;
+
+#define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash])
+
+static int nsems = 0;
+SYSCTL_DECL(_p1003_1b);
+SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
+ "Number of active kernel POSIX semaphores");
+
+static int kern_sem_wait(struct thread *td, semid_t id, int tryflag,
+ struct timespec *abstime);
+static int ksem_access(struct ksem *ks, struct ucred *ucred);
+static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
+ unsigned int value);
+static int ksem_create(struct thread *td, const char *path,
+ semid_t *semidp, mode_t mode, unsigned int value,
+ int flags, int compat32);
+static void ksem_drop(struct ksem *ks);
+static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
+ struct file **fpp);
+static struct ksem *ksem_hold(struct ksem *ks);
+static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
+static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
+static void ksem_module_destroy(void);
+static int ksem_module_init(void);
+static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
+static int sem_modload(struct module *module, int cmd, void *arg);
+
+static fo_rdwr_t ksem_read;
+static fo_rdwr_t ksem_write;
+static fo_truncate_t ksem_truncate;
+static fo_ioctl_t ksem_ioctl;
+static fo_poll_t ksem_poll;
+static fo_kqfilter_t ksem_kqfilter;
+static fo_stat_t ksem_stat;
+static fo_close_t ksem_closef;
+static fo_chmod_t ksem_chmod;
+static fo_chown_t ksem_chown;
+
+/* File descriptor operations. */
+static struct fileops ksem_ops = {
+ .fo_read = ksem_read,
+ .fo_write = ksem_write,
+ .fo_truncate = ksem_truncate,
+ .fo_ioctl = ksem_ioctl,
+ .fo_poll = ksem_poll,
+ .fo_kqfilter = ksem_kqfilter,
+ .fo_stat = ksem_stat,
+ .fo_close = ksem_closef,
+ .fo_chmod = ksem_chmod,
+ .fo_chown = ksem_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+FEATURE(posix_sem, "POSIX semaphores");
+
+static int
+ksem_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+ksem_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+#ifdef MAC
+ int error;
+#endif
+
+ ks = fp->f_data;
+
+#ifdef MAC
+ error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
+ if (error)
+ return (error);
+#endif
+
+ /*
+	 * Attempt to return sane-ish values for fstat() on a semaphore
+ * file descriptor.
+ */
+ bzero(sb, sizeof(*sb));
+
+ mtx_lock(&sem_lock);
+ sb->st_atim = ks->ks_atime;
+ sb->st_ctim = ks->ks_ctime;
+ sb->st_mtim = ks->ks_mtime;
+ sb->st_birthtim = ks->ks_birthtime;
+ sb->st_uid = ks->ks_uid;
+ sb->st_gid = ks->ks_gid;
+ sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */
+ mtx_unlock(&sem_lock);
+
+ return (0);
+}
+
+static int
+ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+ int error;
+
+ error = 0;
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_setmode(active_cred, ks, mode);
+ if (error != 0)
+ goto out;
+#endif
+ error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
+ active_cred, NULL);
+ if (error != 0)
+ goto out;
+ ks->ks_mode = mode & ACCESSPERMS;
+out:
+ mtx_unlock(&sem_lock);
+ return (error);
+}
+
+static int
+ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+ int error;
+
+ error = 0;
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
+ if (error != 0)
+ goto out;
+#endif
+ if (uid == (uid_t)-1)
+ uid = ks->ks_uid;
+ if (gid == (gid_t)-1)
+ gid = ks->ks_gid;
+ if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
+ (gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ ks->ks_uid = uid;
+ ks->ks_gid = gid;
+out:
+ mtx_unlock(&sem_lock);
+ return (error);
+}
+
+static int
+ksem_closef(struct file *fp, struct thread *td)
+{
+ struct ksem *ks;
+
+ ks = fp->f_data;
+ fp->f_data = NULL;
+ ksem_drop(ks);
+
+ return (0);
+}
+
+/*
+ * ksem object management including creation and reference counting
+ * routines.
+ */
+static struct ksem *
+ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
+{
+ struct ksem *ks;
+
+ mtx_lock(&ksem_count_lock);
+ if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
+ mtx_unlock(&ksem_count_lock);
+ return (NULL);
+ }
+ nsems++;
+ mtx_unlock(&ksem_count_lock);
+ ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
+ ks->ks_uid = ucred->cr_uid;
+ ks->ks_gid = ucred->cr_gid;
+ ks->ks_mode = mode;
+ ks->ks_value = value;
+ cv_init(&ks->ks_cv, "ksem");
+ vfs_timestamp(&ks->ks_birthtime);
+ ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
+ refcount_init(&ks->ks_ref, 1);
+#ifdef MAC
+ mac_posixsem_init(ks);
+ mac_posixsem_create(ucred, ks);
+#endif
+
+ return (ks);
+}
+
+static struct ksem *
+ksem_hold(struct ksem *ks)
+{
+
+ refcount_acquire(&ks->ks_ref);
+ return (ks);
+}
+
+static void
+ksem_drop(struct ksem *ks)
+{
+
+ if (refcount_release(&ks->ks_ref)) {
+#ifdef MAC
+ mac_posixsem_destroy(ks);
+#endif
+ cv_destroy(&ks->ks_cv);
+ free(ks, M_KSEM);
+ mtx_lock(&ksem_count_lock);
+ nsems--;
+ mtx_unlock(&ksem_count_lock);
+ }
+}
+
+/*
+ * Determine if the credentials have sufficient permissions for read
+ * and write access.
+ */
+static int
+ksem_access(struct ksem *ks, struct ucred *ucred)
+{
+ int error;
+
+ error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
+ VREAD | VWRITE, ucred, NULL);
+ if (error)
+ error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0);
+ return (error);
+}
+
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to semaphore objects. We use the FNV hash on the path to
+ * store the mappings in a hash table.
+ */
+static struct ksem *
+ksem_lookup(char *path, Fnv32_t fnv)
+{
+ struct ksem_mapping *map;
+
+ LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
+ if (map->km_fnv != fnv)
+ continue;
+ if (strcmp(map->km_path, path) == 0)
+ return (map->km_ksem);
+ }
+
+ return (NULL);
+}
+
+static void
+ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
+{
+ struct ksem_mapping *map;
+
+ map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
+ map->km_path = path;
+ map->km_fnv = fnv;
+ map->km_ksem = ksem_hold(ks);
+ ks->ks_path = path;
+ LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
+}
+
+static int
+ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
+{
+ struct ksem_mapping *map;
+ int error;
+
+ LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
+ if (map->km_fnv != fnv)
+ continue;
+ if (strcmp(map->km_path, path) == 0) {
+#ifdef MAC
+ error = mac_posixsem_check_unlink(ucred, map->km_ksem);
+ if (error)
+ return (error);
+#endif
+ error = ksem_access(map->km_ksem, ucred);
+ if (error)
+ return (error);
+ map->km_ksem->ks_path = NULL;
+ LIST_REMOVE(map, km_link);
+ ksem_drop(map->km_ksem);
+ free(map->km_path, M_KSEM);
+ free(map, M_KSEM);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+ksem_info_impl(struct ksem *ks, char *path, size_t size, uint32_t *value)
+{
+
+ if (ks->ks_path == NULL)
+ return;
+ sx_slock(&ksem_dict_lock);
+ if (ks->ks_path != NULL)
+ strlcpy(path, ks->ks_path, size);
+ if (value != NULL)
+ *value = ks->ks_value;
+ sx_sunlock(&ksem_dict_lock);
+}
+
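+/*
+ * Copy the new semaphore id (which is simply the file descriptor) out
+ * to userland, as a 32-bit value for compat32 callers.
+ */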
+static int
+ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
+ int compat32)
+{
+ semid_t semid;
+#ifdef COMPAT_FREEBSD32
+ int32_t semid32;
+#endif
+ void *ptr;
+ size_t ptrs;
+
+#ifdef COMPAT_FREEBSD32
+ if (compat32) {
+ semid32 = fd;
+ ptr = &semid32;
+ ptrs = sizeof(semid32);
+ } else {
+#endif
+ semid = fd;
+ ptr = &semid;
+ ptrs = sizeof(semid);
+ compat32 = 0; /* silence gcc */
+#ifdef COMPAT_FREEBSD32
+ }
+#endif
+
+ return (copyout(ptr, semidp, ptrs));
+}
+
+/* Other helper routines. */
+static int
+ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
+ unsigned int value, int flags, int compat32)
+{
+ struct filedesc *fdp;
+ struct ksem *ks;
+ struct file *fp;
+ char *path;
+ Fnv32_t fnv;
+ int error, fd;
+
+ if (value > SEM_VALUE_MAX)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error) {
+ if (name == NULL)
+ error = ENOSPC;
+ return (error);
+ }
+
+ /*
+ * Go ahead and copyout the file descriptor now. This is a bit
+	 * premature, but it is a lot easier to handle errors here than
+	 * later, after we have possibly created a new semaphore, etc.
+ */
+ error = ksem_create_copyout_semid(td, semidp, fd, compat32);
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ if (name == NULL) {
+ /* Create an anonymous semaphore. */
+ ks = ksem_alloc(td->td_ucred, mode, value);
+ if (ks == NULL)
+ error = ENOSPC;
+ else
+ ks->ks_flags |= KS_ANONYMOUS;
+ } else {
+ path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
+ error = copyinstr(name, path, MAXPATHLEN, NULL);
+
+ /* Require paths to start with a '/' character. */
+ if (error == 0 && path[0] != '/')
+ error = EINVAL;
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ free(path, M_KSEM);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&ksem_dict_lock);
+ ks = ksem_lookup(path, fnv);
+ if (ks == NULL) {
+ /* Object does not exist, create it if requested. */
+ if (flags & O_CREAT) {
+ ks = ksem_alloc(td->td_ucred, mode, value);
+ if (ks == NULL)
+ error = ENFILE;
+ else {
+ ksem_insert(path, fnv, ks);
+ path = NULL;
+ }
+ } else
+ error = ENOENT;
+ } else {
+ /*
+ * Object already exists, obtain a new
+ * reference if requested and permitted.
+ */
+ if ((flags & (O_CREAT | O_EXCL)) ==
+ (O_CREAT | O_EXCL))
+ error = EEXIST;
+ else {
+#ifdef MAC
+ error = mac_posixsem_check_open(td->td_ucred,
+ ks);
+ if (error == 0)
+#endif
+ error = ksem_access(ks, td->td_ucred);
+ }
+ if (error == 0)
+ ksem_hold(ks);
+#ifdef INVARIANTS
+ else
+ ks = NULL;
+#endif
+ }
+ sx_xunlock(&ksem_dict_lock);
+ if (path)
+ free(path, M_KSEM);
+ }
+
+ if (error) {
+ KASSERT(ks == NULL, ("ksem_create error with a ksem"));
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+ KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
+
+ finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
+
+ fdrop(fp, td);
+
+ return (0);
+}
+
+static int
+ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
+ struct file **fpp)
+{
+ struct ksem *ks;
+ struct file *fp;
+ int error;
+
+ error = fget(td, id, rightsp, &fp);
+ if (error)
+ return (EINVAL);
+ if (fp->f_type != DTYPE_SEM) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ ks = fp->f_data;
+ if (ks->ks_flags & KS_DEAD) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+/* System calls. */
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_init_args {
+ unsigned int value;
+ semid_t *idp;
+};
+#endif
+int
+sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
+{
+
+ return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
+ 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_open_args {
+ char *name;
+ int oflag;
+ mode_t mode;
+ unsigned int value;
+ semid_t *idp;
+};
+#endif
+int
+sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
+{
+
+ DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
+
+ if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
+ return (EINVAL);
+ return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
+ uap->oflag, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_unlink_args {
+ char *name;
+};
+#endif
+int
+sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
+{
+ char *path;
+ Fnv32_t fnv;
+ int error;
+
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
+ if (error) {
+ free(path, M_TEMP);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&ksem_dict_lock);
+ error = ksem_remove(path, fnv, td->td_ucred);
+ sx_xunlock(&ksem_dict_lock);
+ free(path, M_TEMP);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_close_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
+{
+ struct ksem *ks;
+ struct file *fp;
+ int error;
+
+ /* No capability rights required to close a semaphore. */
+ error = ksem_get(td, uap->id, 0, &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ if (ks->ks_flags & KS_ANONYMOUS) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ error = kern_close(td, uap->id);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_post_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
+{
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ error = ksem_get(td, uap->id,
+ cap_rights_init(&rights, CAP_SEM_POST), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
+ if (error)
+ goto err;
+#endif
+ if (ks->ks_value == SEM_VALUE_MAX) {
+ error = EOVERFLOW;
+ goto err;
+ }
+ ++ks->ks_value;
+ if (ks->ks_waiters > 0)
+ cv_signal(&ks->ks_cv);
+ error = 0;
+ vfs_timestamp(&ks->ks_ctime);
+err:
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_wait_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
+{
+
+ return (kern_sem_wait(td, uap->id, 0, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_timedwait_args {
+ semid_t id;
+ const struct timespec *abstime;
+};
+#endif
+int
+sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
+{
+ struct timespec abstime;
+ struct timespec *ts;
+ int error;
+
+ /*
+ * We allow a null timespec (wait forever).
+ */
+ if (uap->abstime == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->abstime, &abstime, sizeof(abstime));
+ if (error != 0)
+ return (error);
+ if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
+ return (EINVAL);
+ ts = &abstime;
+ }
+ return (kern_sem_wait(td, uap->id, 0, ts));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_trywait_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
+{
+
+ return (kern_sem_wait(td, uap->id, 1, NULL));
+}
+
+static int
+kern_sem_wait(struct thread *td, semid_t id, int tryflag,
+ struct timespec *abstime)
+{
+ struct timespec ts1, ts2;
+ struct timeval tv;
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
+ error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+ DP((">>> kern_sem_wait critical section entered! pid=%d\n",
+ (int)td->td_proc->p_pid));
+#ifdef MAC
+ error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
+ if (error) {
+ DP(("kern_sem_wait mac failed\n"));
+ goto err;
+ }
+#endif
+ DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
+ vfs_timestamp(&ks->ks_atime);
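+	/*
+	 * Sleep until the semaphore value becomes positive.  An absolute
+	 * timeout is re-converted to a relative interval on every pass so
+	 * that each wakeup re-checks the deadline against the current time.
+	 */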
+ while (ks->ks_value == 0) {
+ ks->ks_waiters++;
+ if (tryflag != 0)
+ error = EAGAIN;
+ else if (abstime == NULL)
+ error = cv_wait_sig(&ks->ks_cv, &sem_lock);
+ else {
+ for (;;) {
+ ts1 = *abstime;
+ getnanotime(&ts2);
+ timespecsub(&ts1, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts1);
+ if (tv.tv_sec < 0) {
+ error = ETIMEDOUT;
+ break;
+ }
+ error = cv_timedwait_sig(&ks->ks_cv,
+ &sem_lock, tvtohz(&tv));
+ if (error != EWOULDBLOCK)
+ break;
+ }
+ }
+ ks->ks_waiters--;
+ if (error)
+ goto err;
+ }
+ ks->ks_value--;
+ DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
+ error = 0;
+err:
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
+ (int)td->td_proc->p_pid, error));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_getvalue_args {
+ semid_t id;
+ int *val;
+};
+#endif
+int
+sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
+{
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error, val;
+
+ error = ksem_get(td, uap->id,
+ cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
+ if (error) {
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+ val = ks->ks_value;
+ vfs_timestamp(&ks->ks_atime);
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ error = copyout(&val, uap->val, sizeof(val));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_destroy_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
+{
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ /* No capability rights required to close a semaphore. */
+ error = ksem_get(td, uap->id, 0, &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ if (!(ks->ks_flags & KS_ANONYMOUS)) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ mtx_lock(&sem_lock);
+ if (ks->ks_waiters != 0) {
+ mtx_unlock(&sem_lock);
+ error = EBUSY;
+ goto err;
+ }
+ ks->ks_flags |= KS_DEAD;
+ mtx_unlock(&sem_lock);
+
+ error = kern_close(td, uap->id);
+err:
+ fdrop(fp, td);
+ return (error);
+}
+
+static struct syscall_helper_data ksem_syscalls[] = {
+ SYSCALL_INIT_HELPER(ksem_init),
+ SYSCALL_INIT_HELPER(ksem_open),
+ SYSCALL_INIT_HELPER(ksem_unlink),
+ SYSCALL_INIT_HELPER(ksem_close),
+ SYSCALL_INIT_HELPER(ksem_post),
+ SYSCALL_INIT_HELPER(ksem_wait),
+ SYSCALL_INIT_HELPER(ksem_timedwait),
+ SYSCALL_INIT_HELPER(ksem_trywait),
+ SYSCALL_INIT_HELPER(ksem_getvalue),
+ SYSCALL_INIT_HELPER(ksem_destroy),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+int
+freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
+{
+
+ return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
+ 0, 1));
+}
+
+int
+freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
+{
+
+ if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
+ return (EINVAL);
+ return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
+ uap->oflag, 1));
+}
+
+int
+freebsd32_ksem_timedwait(struct thread *td,
+ struct freebsd32_ksem_timedwait_args *uap)
+{
+ struct timespec32 abstime32;
+ struct timespec *ts, abstime;
+ int error;
+
+ /*
+ * We allow a null timespec (wait forever).
+ */
+ if (uap->abstime == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
+ if (error != 0)
+ return (error);
+ CP(abstime32, abstime, tv_sec);
+ CP(abstime32, abstime, tv_nsec);
+ if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
+ return (EINVAL);
+ ts = &abstime;
+ }
+ return (kern_sem_wait(td, uap->id, 0, ts));
+}
+
+static struct syscall_helper_data ksem32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+ksem_module_init(void)
+{
+ int error;
+
+ mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
+ mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
+ sx_init(&ksem_dict_lock, "ksem dictionary");
+ ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
+ p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
+ p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
+ p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
+ ksem_info = ksem_info_impl;
+
+ error = syscall_helper_register(ksem_syscalls);
+ if (error)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(ksem32_syscalls);
+ if (error)
+ return (error);
+#endif
+ return (0);
+}
+
+static void
+ksem_module_destroy(void)
+{
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(ksem32_syscalls);
+#endif
+ syscall_helper_unregister(ksem_syscalls);
+
+ ksem_info = NULL;
+ p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
+ hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
+ sx_destroy(&ksem_dict_lock);
+ mtx_destroy(&ksem_count_lock);
+ mtx_destroy(&sem_lock);
+ p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
+ p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
+}
+
+static int
+sem_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = ksem_module_init();
+ if (error)
+ ksem_module_destroy();
+ break;
+
+ case MOD_UNLOAD:
+ mtx_lock(&ksem_count_lock);
+ if (nsems != 0) {
+ error = EOPNOTSUPP;
+ mtx_unlock(&ksem_count_lock);
+ break;
+ }
+ ksem_dead = 1;
+ mtx_unlock(&ksem_count_lock);
+ ksem_module_destroy();
+ break;
+
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sem_mod = {
+ "sem",
+ &sem_modload,
+ NULL
+};
+
+DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
+MODULE_VERSION(sem, 1);
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
new file mode 100644
index 0000000..54366af
--- /dev/null
+++ b/sys/kern/uipc_shm.c
@@ -0,0 +1,1033 @@
+/*-
+ * Copyright (c) 2006, 2011 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Support for shared swap-backed anonymous memory objects via
+ * shm_open(2) and shm_unlink(2). While most of the implementation is
+ * here, vm_mmap.c contains mapping logic changes.
+ *
+ * TODO:
+ *
+ * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1)
+ * and ipcrm(1) be expanded or should new tools to manage both POSIX
+ * kernel semaphores and POSIX shared memory be written?
+ *
+ * (2) Add support for this file type to fstat(1).
+ *
+ * (3) Resource limits? Does this need its own resource limits or are the
+ * existing limits in mmap(2) sufficient?
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/unistd.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
+struct shm_mapping {
+ char *sm_path;
+ Fnv32_t sm_fnv;
+ struct shmfd *sm_shmfd;
+ LIST_ENTRY(shm_mapping) sm_link;
+};
+
+static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
+static LIST_HEAD(, shm_mapping) *shm_dictionary;
+static struct sx shm_dict_lock;
+static struct mtx shm_timestamp_lock;
+static u_long shm_hash;
+
+#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash])
+
+static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
+static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
+static void shm_dict_init(void *arg);
+static void shm_drop(struct shmfd *shmfd);
+static struct shmfd *shm_hold(struct shmfd *shmfd);
+static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
+static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
+static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
+static int shm_dotruncate(struct shmfd *shmfd, off_t length);
+
+static fo_rdwr_t shm_read;
+static fo_rdwr_t shm_write;
+static fo_truncate_t shm_truncate;
+static fo_ioctl_t shm_ioctl;
+static fo_poll_t shm_poll;
+static fo_kqfilter_t shm_kqfilter;
+static fo_stat_t shm_stat;
+static fo_close_t shm_close;
+static fo_chmod_t shm_chmod;
+static fo_chown_t shm_chown;
+static fo_seek_t shm_seek;
+
+/* File descriptor operations. */
+static struct fileops shm_ops = {
+ .fo_read = shm_read,
+ .fo_write = shm_write,
+ .fo_truncate = shm_truncate,
+ .fo_ioctl = shm_ioctl,
+ .fo_poll = shm_poll,
+ .fo_kqfilter = shm_kqfilter,
+ .fo_stat = shm_stat,
+ .fo_close = shm_close,
+ .fo_chmod = shm_chmod,
+ .fo_chown = shm_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_seek = shm_seek,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
+FEATURE(posix_shm, "POSIX shared memory");
+
+static int
+uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
+{
+ vm_page_t m;
+ vm_pindex_t idx;
+ size_t tlen;
+ int error, offset, rv;
+
+ idx = OFF_TO_IDX(uio->uio_offset);
+ offset = uio->uio_offset & PAGE_MASK;
+ tlen = MIN(PAGE_SIZE - offset, len);
+
+ VM_OBJECT_WLOCK(obj);
+
+ /*
+ * Parallel reads of the page content from disk are prevented
+ * by exclusive busy.
+ *
+ * Although the tmpfs vnode lock is held here, it is
+ * nonetheless safe to sleep waiting for a free page. The
+ * pageout daemon does not need to acquire the tmpfs vnode
+	 * lock to page out the object's pages because it is an OBJT_SWAP
+	 * type object.
+ */
+ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
+ if (m->valid != VM_PAGE_BITS_ALL) {
+ if (vm_pager_has_page(obj, idx, NULL, NULL)) {
+ rv = vm_pager_get_pages(obj, &m, 1, 0);
+ m = vm_page_lookup(obj, idx);
+ if (m == NULL) {
+ printf(
+ "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n",
+ obj, idx, rv);
+ VM_OBJECT_WUNLOCK(obj);
+ return (EIO);
+ }
+ if (rv != VM_PAGER_OK) {
+ printf(
+ "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
+ obj, idx, m->valid, rv);
+ vm_page_lock(m);
+ vm_page_free(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ return (EIO);
+ }
+ } else
+ vm_page_zero_invalid(m, TRUE);
+ }
+ vm_page_xunbusy(m);
+ vm_page_lock(m);
+ vm_page_hold(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ error = uiomove_fromphys(&m, offset, tlen, uio);
+ if (uio->uio_rw == UIO_WRITE && error == 0) {
+ VM_OBJECT_WLOCK(obj);
+ vm_page_dirty(m);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ if (m->queue == PQ_NONE) {
+ vm_page_deactivate(m);
+ } else {
+ /* Requeue to maintain LRU ordering. */
+ vm_page_requeue(m);
+ }
+ vm_page_unlock(m);
+
+ return (error);
+}
+
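+/*
+ * Transfer data between the uio and the backing VM object one page at a
+ * time, stopping at obj_size or when a pass makes no progress.
+ */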
+int
+uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
+{
+ ssize_t resid;
+ size_t len;
+ int error;
+
+ error = 0;
+ while ((resid = uio->uio_resid) > 0) {
+ if (obj_size <= uio->uio_offset)
+ break;
+ len = MIN(obj_size - uio->uio_offset, resid);
+ if (len == 0)
+ break;
+ error = uiomove_object_page(obj, len, uio);
+ if (error != 0 || resid == uio->uio_resid)
+ break;
+ }
+ return (error);
+}
+
+static int
+shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+ struct shmfd *shmfd;
+ off_t foffset;
+ int error;
+
+ shmfd = fp->f_data;
+ foffset = foffset_lock(fp, 0);
+ error = 0;
+ switch (whence) {
+ case L_INCR:
+ if (foffset < 0 ||
+ (offset > 0 && foffset > OFF_MAX - offset)) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += foffset;
+ break;
+ case L_XTND:
+ if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += shmfd->shm_size;
+ break;
+ case L_SET:
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error == 0) {
+ if (offset < 0 || offset > shmfd->shm_size)
+ error = EINVAL;
+ else
+ *(off_t *)(td->td_retval) = offset;
+ }
+ foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+ return (error);
+}
+
+static int
+shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct shmfd *shmfd;
+ void *rl_cookie;
+ int error;
+
+ shmfd = fp->f_data;
+ foffset_lock_uio(fp, uio, flags);
+ rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+#ifdef MAC
+ error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+static int
+shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct shmfd *shmfd;
+ void *rl_cookie;
+ int error;
+
+ shmfd = fp->f_data;
+#ifdef MAC
+ error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ foffset_lock_uio(fp, uio, flags);
+ if ((flags & FOF_OFFSET) == 0) {
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+ &shmfd->shm_mtx);
+ } else {
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+ }
+
+ error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+static int
+shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+#ifdef MAC
+ int error;
+#endif
+
+ shmfd = fp->f_data;
+#ifdef MAC
+ error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ return (shm_dotruncate(shmfd, length));
+}
+
+static int
+shm_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+#ifdef MAC
+ int error;
+#endif
+
+ shmfd = fp->f_data;
+
+#ifdef MAC
+ error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+
+ /*
+ * Attempt to return sensible values for fstat() on a memory file
+ * descriptor.
+ */
+ bzero(sb, sizeof(*sb));
+ sb->st_blksize = PAGE_SIZE;
+ sb->st_size = shmfd->shm_size;
+ sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
+ mtx_lock(&shm_timestamp_lock);
+ sb->st_atim = shmfd->shm_atime;
+ sb->st_ctim = shmfd->shm_ctime;
+ sb->st_mtim = shmfd->shm_mtime;
+ sb->st_birthtim = shmfd->shm_birthtime;
+ sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */
+ sb->st_uid = shmfd->shm_uid;
+ sb->st_gid = shmfd->shm_gid;
+ mtx_unlock(&shm_timestamp_lock);
+
+ return (0);
+}
+
+static int
+shm_close(struct file *fp, struct thread *td)
+{
+ struct shmfd *shmfd;
+
+ shmfd = fp->f_data;
+ fp->f_data = NULL;
+ shm_drop(shmfd);
+
+ return (0);
+}
+
+static int
+shm_dotruncate(struct shmfd *shmfd, off_t length)
+{
+ vm_object_t object;
+ vm_page_t m, ma[1];
+ vm_pindex_t idx, nobjsize;
+ vm_ooffset_t delta;
+ int base, rv;
+
+ object = shmfd->shm_object;
+ VM_OBJECT_WLOCK(object);
+ if (length == shmfd->shm_size) {
+ VM_OBJECT_WUNLOCK(object);
+ return (0);
+ }
+ nobjsize = OFF_TO_IDX(length + PAGE_MASK);
+
+ /* Are we shrinking? If so, trim the end. */
+ if (length < shmfd->shm_size) {
+ /*
+ * Disallow any requests to shrink the size if this
+ * object is mapped into the kernel.
+ */
+ if (shmfd->shm_kmappings > 0) {
+ VM_OBJECT_WUNLOCK(object);
+ return (EBUSY);
+ }
+
+ /*
+ * Zero the truncated part of the last page.
+ */
+ base = length & PAGE_MASK;
+ if (base != 0) {
+ idx = OFF_TO_IDX(length);
+retry:
+ m = vm_page_lookup(object, idx);
+ if (m != NULL) {
+ if (vm_page_sleep_if_busy(m, "shmtrc"))
+ goto retry;
+ } else if (vm_pager_has_page(object, idx, NULL, NULL)) {
+ m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(object);
+ goto retry;
+ } else if (m->valid != VM_PAGE_BITS_ALL) {
+ ma[0] = m;
+ rv = vm_pager_get_pages(object, ma, 1,
+ 0);
+ m = vm_page_lookup(object, idx);
+ } else
+ /* A cached page was reactivated. */
+ rv = VM_PAGER_OK;
+ vm_page_lock(m);
+ if (rv == VM_PAGER_OK) {
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
+ } else {
+ vm_page_free(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(object);
+ return (EIO);
+ }
+ }
+ if (m != NULL) {
+ pmap_zero_page_area(m, base, PAGE_SIZE - base);
+ KASSERT(m->valid == VM_PAGE_BITS_ALL,
+ ("shm_dotruncate: page %p is invalid", m));
+ vm_page_dirty(m);
+ vm_pager_page_unswapped(m);
+ }
+ }
+ delta = ptoa(object->size - nobjsize);
+
+ /* Toss in memory pages. */
+ if (nobjsize < object->size)
+ vm_object_page_remove(object, nobjsize, object->size,
+ 0);
+
+ /* Toss pages from swap. */
+ if (object->type == OBJT_SWAP)
+ swap_pager_freespace(object, nobjsize, delta);
+
+ /* Release the swap space accounted for the object. */
+ swap_release_by_cred(delta, object->cred);
+ object->charge -= delta;
+ } else {
+ /* Attempt to reserve the swap */
+ delta = ptoa(nobjsize - object->size);
+ if (!swap_reserve_by_cred(delta, object->cred)) {
+ VM_OBJECT_WUNLOCK(object);
+ return (ENOMEM);
+ }
+ object->charge += delta;
+ }
+ shmfd->shm_size = length;
+ mtx_lock(&shm_timestamp_lock);
+ vfs_timestamp(&shmfd->shm_ctime);
+ shmfd->shm_mtime = shmfd->shm_ctime;
+ mtx_unlock(&shm_timestamp_lock);
+ object->size = nobjsize;
+ VM_OBJECT_WUNLOCK(object);
+ return (0);
+}
+
+/*
+ * shmfd object management including creation and reference counting
+ * routines.
+ */
+static struct shmfd *
+shm_alloc(struct ucred *ucred, mode_t mode)
+{
+ struct shmfd *shmfd;
+
+ shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
+ shmfd->shm_size = 0;
+ shmfd->shm_uid = ucred->cr_uid;
+ shmfd->shm_gid = ucred->cr_gid;
+ shmfd->shm_mode = mode;
+ shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
+ shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
+ VM_OBJECT_WLOCK(shmfd->shm_object);
+ vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
+ VM_OBJECT_WUNLOCK(shmfd->shm_object);
+ vfs_timestamp(&shmfd->shm_birthtime);
+ shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
+ shmfd->shm_birthtime;
+ refcount_init(&shmfd->shm_refs, 1);
+ mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
+ rangelock_init(&shmfd->shm_rl);
+#ifdef MAC
+ mac_posixshm_init(shmfd);
+ mac_posixshm_create(ucred, shmfd);
+#endif
+
+ return (shmfd);
+}
+
+static struct shmfd *
+shm_hold(struct shmfd *shmfd)
+{
+
+ refcount_acquire(&shmfd->shm_refs);
+ return (shmfd);
+}
+
+static void
+shm_drop(struct shmfd *shmfd)
+{
+
+ if (refcount_release(&shmfd->shm_refs)) {
+#ifdef MAC
+ mac_posixshm_destroy(shmfd);
+#endif
+ rangelock_destroy(&shmfd->shm_rl);
+ mtx_destroy(&shmfd->shm_mtx);
+ vm_object_deallocate(shmfd->shm_object);
+ free(shmfd, M_SHMFD);
+ }
+}
+
+/*
+ * Determine if the credentials have sufficient permissions for a
+ * specified combination of FREAD and FWRITE.
+ */
+static int
+shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
+{
+ accmode_t accmode;
+ int error;
+
+ accmode = 0;
+ if (flags & FREAD)
+ accmode |= VREAD;
+ if (flags & FWRITE)
+ accmode |= VWRITE;
+ mtx_lock(&shm_timestamp_lock);
+ error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
+ accmode, ucred, NULL);
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to shmfd objects. We use the FNV hash on the path to store
+ * the mappings in a hash table.
+ */
+static void
+shm_dict_init(void *arg)
+{
+
+ mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
+ sx_init(&shm_dict_lock, "shm dictionary");
+ shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
+}
+SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
+
+static struct shmfd *
+shm_lookup(char *path, Fnv32_t fnv)
+{
+ struct shm_mapping *map;
+
+ LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
+ if (map->sm_fnv != fnv)
+ continue;
+ if (strcmp(map->sm_path, path) == 0)
+ return (map->sm_shmfd);
+ }
+
+ return (NULL);
+}
+
+static void
+shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
+{
+ struct shm_mapping *map;
+
+ map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
+ map->sm_path = path;
+ map->sm_fnv = fnv;
+ map->sm_shmfd = shm_hold(shmfd);
+ shmfd->shm_path = path;
+ LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
+}
+
+static int
+shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
+{
+ struct shm_mapping *map;
+ int error;
+
+ LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
+ if (map->sm_fnv != fnv)
+ continue;
+ if (strcmp(map->sm_path, path) == 0) {
+#ifdef MAC
+ error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
+ if (error)
+ return (error);
+#endif
+ error = shm_access(map->sm_shmfd, ucred,
+ FREAD | FWRITE);
+ if (error)
+ return (error);
+ map->sm_shmfd->shm_path = NULL;
+ LIST_REMOVE(map, sm_link);
+ shm_drop(map->sm_shmfd);
+ free(map->sm_path, M_SHMFD);
+ free(map, M_SHMFD);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/* System calls. */
+int
+sys_shm_open(struct thread *td, struct shm_open_args *uap)
+{
+ struct filedesc *fdp;
+ struct shmfd *shmfd;
+ struct file *fp;
+ char *path;
+ Fnv32_t fnv;
+ mode_t cmode;
+ int fd, error;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * shm_open(2) is only allowed for anonymous objects.
+ */
+ if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
+ return (ECAPMODE);
+#endif
+
+ if ((uap->flags & O_ACCMODE) != O_RDONLY &&
+ (uap->flags & O_ACCMODE) != O_RDWR)
+ return (EINVAL);
+
+ if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
+
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error)
+ return (error);
+
+ /* A SHM_ANON path pointer creates an anonymous object. */
+ if (uap->path == SHM_ANON) {
+ /* A read-only anonymous object is pointless. */
+ if ((uap->flags & O_ACCMODE) == O_RDONLY) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ shmfd = shm_alloc(td->td_ucred, cmode);
+ } else {
+ path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
+ error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+
+ /* Require paths to start with a '/' character. */
+ if (error == 0 && path[0] != '/')
+ error = EINVAL;
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ free(path, M_SHMFD);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&shm_dict_lock);
+ shmfd = shm_lookup(path, fnv);
+ if (shmfd == NULL) {
+ /* Object does not yet exist, create it if requested. */
+ if (uap->flags & O_CREAT) {
+#ifdef MAC
+ error = mac_posixshm_check_create(td->td_ucred,
+ path);
+ if (error == 0) {
+#endif
+ shmfd = shm_alloc(td->td_ucred, cmode);
+ shm_insert(path, fnv, shmfd);
+#ifdef MAC
+ }
+#endif
+ } else {
+ free(path, M_SHMFD);
+ error = ENOENT;
+ }
+ } else {
+ /*
+ * Object already exists, obtain a new
+ * reference if requested and permitted.
+ */
+ free(path, M_SHMFD);
+ if ((uap->flags & (O_CREAT | O_EXCL)) ==
+ (O_CREAT | O_EXCL))
+ error = EEXIST;
+ else {
+#ifdef MAC
+ error = mac_posixshm_check_open(td->td_ucred,
+ shmfd, FFLAGS(uap->flags & O_ACCMODE));
+ if (error == 0)
+#endif
+ error = shm_access(shmfd, td->td_ucred,
+ FFLAGS(uap->flags & O_ACCMODE));
+ }
+
+ /*
+ * Truncate the file back to zero length if
+ * O_TRUNC was specified and the object was
+ * opened with read/write.
+ */
+ if (error == 0 &&
+ (uap->flags & (O_ACCMODE | O_TRUNC)) ==
+ (O_RDWR | O_TRUNC)) {
+#ifdef MAC
+ error = mac_posixshm_check_truncate(
+ td->td_ucred, fp->f_cred, shmfd);
+ if (error == 0)
+#endif
+ shm_dotruncate(shmfd, 0);
+ }
+ if (error == 0)
+ shm_hold(shmfd);
+ }
+ sx_xunlock(&shm_dict_lock);
+
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+
+ finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
+
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+
+ return (0);
+}
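+
+/*
+ * For illustration, the usual userland pattern over this interface is to
+ * create an object, size it with ftruncate(2) and map it; a minimal
+ * sketch (the names "len" and "p" are hypothetical, error handling is
+ * elided):
+ *
+ *	fd = shm_open(SHM_ANON, O_RDWR, 0600);
+ *	ftruncate(fd, len);
+ *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ */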
+
+int
+sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
+{
+ char *path;
+ Fnv32_t fnv;
+ int error;
+
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+ if (error) {
+ free(path, M_TEMP);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&shm_dict_lock);
+ error = shm_remove(path, fnv, td->td_ucred);
+ sx_xunlock(&shm_dict_lock);
+ free(path, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * mmap() helper to validate mmap() requests against shm object state
+ * and give mmap() the vm_object to use for the mapping.
+ */
+int
+shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
+ vm_object_t *obj)
+{
+
+ /*
+ * XXXRW: This validation is probably insufficient, and subject to
+ * sign errors. It should be fixed.
+ */
+ if (foff >= shmfd->shm_size ||
+ foff + objsize > round_page(shmfd->shm_size))
+ return (EINVAL);
+
+ mtx_lock(&shm_timestamp_lock);
+ vfs_timestamp(&shmfd->shm_atime);
+ mtx_unlock(&shm_timestamp_lock);
+ vm_object_reference(shmfd->shm_object);
+ *obj = shmfd->shm_object;
+ return (0);
+}
+
+static int
+shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+ int error;
+
+ error = 0;
+ shmfd = fp->f_data;
+ mtx_lock(&shm_timestamp_lock);
+ /*
+ * SUSv4 says that x bits of permission need not be affected.
+ * Be consistent with our shm_open there.
+ */
+#ifdef MAC
+ error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
+ if (error != 0)
+ goto out;
+#endif
+ error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
+ shmfd->shm_gid, VADMIN, active_cred, NULL);
+ if (error != 0)
+ goto out;
+ shmfd->shm_mode = mode & ACCESSPERMS;
+out:
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+static int
+shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+ int error;
+
+ error = 0;
+ shmfd = fp->f_data;
+ mtx_lock(&shm_timestamp_lock);
+#ifdef MAC
+ error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
+ if (error != 0)
+ goto out;
+#endif
+ if (uid == (uid_t)-1)
+ uid = shmfd->shm_uid;
+ if (gid == (gid_t)-1)
+ gid = shmfd->shm_gid;
+ if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
+ (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ shmfd->shm_uid = uid;
+ shmfd->shm_gid = gid;
+out:
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+/*
+ * Helper routines to allow the backing object of a shared memory file
+ * descriptor to be mapped in the kernel.
+ */
+int
+shm_map(struct file *fp, size_t size, off_t offset, void **memp)
+{
+ struct shmfd *shmfd;
+ vm_offset_t kva, ofs;
+ vm_object_t obj;
+ int rv;
+
+ if (fp->f_type != DTYPE_SHM)
+ return (EINVAL);
+ shmfd = fp->f_data;
+ obj = shmfd->shm_object;
+ VM_OBJECT_WLOCK(obj);
+ /*
+ * XXXRW: This validation is probably insufficient, and subject to
+ * sign errors. It should be fixed.
+ */
+ if (offset >= shmfd->shm_size ||
+ offset + size > round_page(shmfd->shm_size)) {
+ VM_OBJECT_WUNLOCK(obj);
+ return (EINVAL);
+ }
+
+ shmfd->shm_kmappings++;
+ vm_object_reference_locked(obj);
+ VM_OBJECT_WUNLOCK(obj);
+
+ /* Map the object into the kernel_map and wire it. */
+ kva = vm_map_min(kernel_map);
+ ofs = offset & PAGE_MASK;
+ offset = trunc_page(offset);
+ size = round_page(size + ofs);
+ rv = vm_map_find(kernel_map, obj, offset, &kva, size,
+ VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
+ VM_PROT_READ | VM_PROT_WRITE, 0);
+ if (rv == KERN_SUCCESS) {
+ rv = vm_map_wire(kernel_map, kva, kva + size,
+ VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+ if (rv == KERN_SUCCESS) {
+ *memp = (void *)(kva + ofs);
+ return (0);
+ }
+ vm_map_remove(kernel_map, kva, kva + size);
+ } else
+ vm_object_deallocate(obj);
+
+ /* On failure, drop our mapping reference. */
+ VM_OBJECT_WLOCK(obj);
+ shmfd->shm_kmappings--;
+ VM_OBJECT_WUNLOCK(obj);
+
+ return (vm_mmap_to_errno(rv));
+}
+
+/*
+ * We require the caller to unmap the entire entry. This allows us to
+ * safely decrement shm_kmappings when a mapping is removed.
+ */
+int
+shm_unmap(struct file *fp, void *mem, size_t size)
+{
+ struct shmfd *shmfd;
+ vm_map_entry_t entry;
+ vm_offset_t kva, ofs;
+ vm_object_t obj;
+ vm_pindex_t pindex;
+ vm_prot_t prot;
+ boolean_t wired;
+ vm_map_t map;
+ int rv;
+
+ if (fp->f_type != DTYPE_SHM)
+ return (EINVAL);
+ shmfd = fp->f_data;
+ kva = (vm_offset_t)mem;
+ ofs = kva & PAGE_MASK;
+ kva = trunc_page(kva);
+ size = round_page(size + ofs);
+ map = kernel_map;
+ rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
+ &obj, &pindex, &prot, &wired);
+ if (rv != KERN_SUCCESS)
+ return (EINVAL);
+ if (entry->start != kva || entry->end != kva + size) {
+ vm_map_lookup_done(map, entry);
+ return (EINVAL);
+ }
+ vm_map_lookup_done(map, entry);
+ if (obj != shmfd->shm_object)
+ return (EINVAL);
+ vm_map_remove(map, kva, kva + size);
+ VM_OBJECT_WLOCK(obj);
+ KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
+ shmfd->shm_kmappings--;
+ VM_OBJECT_WUNLOCK(obj);
+ return (0);
+}
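+
+/*
+ * For illustration, a kernel consumer of the two helpers above would be
+ * expected to follow roughly this pattern (a sketch only; "fp", "len" and
+ * "off" are hypothetical and error handling is elided):
+ *
+ *	void *mem;
+ *
+ *	if (shm_map(fp, len, off, &mem) == 0) {
+ *		... access the object through mem ...
+ *		shm_unmap(fp, mem, len);
+ *	}
+ */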
+
+void
+shm_path(struct shmfd *shmfd, char *path, size_t size)
+{
+
+ if (shmfd->shm_path == NULL)
+ return;
+ sx_slock(&shm_dict_lock);
+ if (shmfd->shm_path != NULL)
+ strlcpy(path, shmfd->shm_path, size);
+ sx_sunlock(&shm_dict_lock);
+}
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
new file mode 100644
index 0000000..9fa8ae0
--- /dev/null
+++ b/sys/kern/uipc_sockbuf.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+/*
+ * Function pointer set by the AIO routines so that the socket buffer code
+ * can call back into the AIO module if it is loaded.
+ */
+void (*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on socket buffers
+ */
+
+u_long sb_max = SB_MAX;
+u_long sb_max_adj =
+ (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
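+
+/*
+ * A worked example of the adjustment above, assuming the common
+ * historical values MCLBYTES = 2048 and MSIZE = 256 (both are
+ * machine-dependent): sb_max_adj = sb_max * 2048 / 2304, i.e. roughly
+ * 8/9 of sb_max, which accounts for the mbuf header consumed along with
+ * each cluster of data.
+ */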
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+static void sbdrop_internal(struct sockbuf *sb, int len);
+static void sbflush_internal(struct sockbuf *sb);
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the socket; it
+ * is normally applied to a socket by the protocol code when the user
+ * informs the system that no more data is to be sent (the PRU_SHUTDOWN
+ * case). Socantrcvmore indicates that no more data will be
+ * received, and will normally be applied to the socket by a protocol when it
+ * detects that the peer will send no more data. Data queued for reading in
+ * the socket may yet be read.
+ */
+void
+socantsendmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantsendmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_snd);
+ socantsendmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantrcvmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+void
+socantrcvmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ socantrcvmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sb->sb_flags |= SB_WAIT;
+ return (msleep_sbt(&sb->sb_cc, &sb->sb_mtx,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo, 0, 0));
+}
+
+int
+sblock(struct sockbuf *sb, int flags)
+{
+
+ KASSERT((flags & SBL_VALID) == flags,
+ ("sblock: flags invalid (0x%x)", flags));
+
+ if (flags & SBL_WAIT) {
+ if ((sb->sb_flags & SB_NOINTR) ||
+ (flags & SBL_NOINTR)) {
+ sx_xlock(&sb->sb_sx);
+ return (0);
+ }
+ return (sx_xlock_sig(&sb->sb_sx));
+ } else {
+ if (sx_try_xlock(&sb->sb_sx) == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+}
+
+void
+sbunlock(struct sockbuf *sb)
+{
+
+ sx_xunlock(&sb->sb_sx);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer. Do asynchronous notification
+ * via SIGIO if the socket has the SS_ASYNC flag set.
+ *
+ * Called with the socket buffer lock held; will release the lock by the end
+ * of the function. This allows the caller to acquire the socket buffer lock
+ * while testing for the need for various sorts of wakeup and hold it through
+ * to the point where it's no longer required. We currently hold the lock
+ * through calls out to other subsystems (with the exception of kqueue), and
+ * then release it to avoid lock order issues. It's not clear that's
+ * correct.
+ */
+void
+sowakeup(struct socket *so, struct sockbuf *sb)
+{
+ int ret;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ selwakeuppri(&sb->sb_sel, PSOCK);
+ if (!SEL_WAITING(&sb->sb_sel))
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup(&sb->sb_cc);
+ }
+ KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+ if (sb->sb_upcall != NULL) {
+ ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
+ if (ret == SU_ISCONNECTED) {
+ KASSERT(sb == &so->so_rcv,
+ ("SO_SND upcall returned SU_ISCONNECTED"));
+ soupcall_clear(so, SO_RCV);
+ }
+ } else
+ ret = SU_OK;
+ if (sb->sb_flags & SB_AIO)
+ aio_swake(so, sb);
+ SOCKBUF_UNLOCK(sb);
+ if (ret == SU_ISCONNECTED)
+ soisconnected(so);
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGIO, 0);
+ mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and one for
+ * receiving data. Each buffer contains a queue of mbufs, information about
+ * the number of mbufs and amount of data in the queue, and other fields
+ * allowing select() statements and notification on data availability to be
+ * implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records. Each
+ * record is a list of mbufs chained together with the m_next field. Records
+ * are chained together with the m_nextpkt field. The upper level routine
+ * soreceive() expects the following conventions to be observed when placing
+ * information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's name,
+ * then a record containing that name must be present before any
+ * associated data (mbufs must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really just
+ * additional data associated with the message), and there are ``rights''
+ * to be received, then a record containing this data should be present
+ * (mbufs must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by a data
+ * record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space should
+ * be released by calling sbrelease() when the socket is destroyed.
+ */
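+
+/*
+ * To make the record layout above concrete: records hang off sb_mb and
+ * are chained through m_nextpkt, while the mbufs of a single record are
+ * chained through m_next.  A full traversal therefore looks roughly like
+ * this (an illustrative sketch; "rec" and "m" are local variables, not
+ * sockbuf fields):
+ *
+ *	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt)
+ *		for (m = rec; m != NULL; m = m->m_next)
+ *			... one mbuf of the current record ...
+ */
+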
+int
+soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
+{
+ struct thread *td = curthread;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
+ goto bad;
+ if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+bad2:
+ sbrelease_locked(&so->so_snd, so);
+bad:
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (ENOBUFS);
+}
+
+static int
+sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ u_long tmp_sb_max = sb_max;
+
+ error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
+ if (error || !req->newptr)
+ return (error);
+ if (tmp_sb_max < MSIZE + MCLBYTES)
+ return (EINVAL);
+ sb_max = tmp_sb_max;
+ sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+ return (0);
+}
+
+/*
+ * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't
+ * become limiting if buffering efficiency is near the normal case.
+ */
+int
+sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ rlim_t sbsize_limit;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ /*
+ * When a thread is passed, we take into account the thread's socket
+ * buffer size limit. The caller will generally pass curthread, but
+ * in the TCP input path, NULL will be passed to indicate that no
+ * appropriate thread resource limits are available. In that case,
+ * we don't apply a process limit.
+ */
+ if (cc > sb_max_adj)
+ return (0);
+ if (td != NULL) {
+ PROC_LOCK(td->td_proc);
+ sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
+ PROC_UNLOCK(td->td_proc);
+ } else
+ sbsize_limit = RLIM_INFINITY;
+ if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+ sbsize_limit))
+ return (0);
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+int
+sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ int error;
+
+ SOCKBUF_LOCK(sb);
+ error = sbreserve_locked(sb, cc, so, td);
+ SOCKBUF_UNLOCK(sb);
+ return (error);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease_internal(struct sockbuf *sb, struct socket *so)
+{
+
+ sbflush_internal(sb);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
+ sb->sb_mbmax = 0;
+}
+
+void
+sbrelease_locked(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbrelease_internal(sb, so);
+}
+
+void
+sbrelease(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbrelease_locked(sb, so);
+ SOCKBUF_UNLOCK(sb);
+}
+
+void
+sbdestroy(struct sockbuf *sb, struct socket *so)
+{
+
+ sbrelease_internal(sb, so);
+}
+
+/*
+ * Routines to add and remove data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to append
+ * new mbufs to a socket buffer, after checking that adequate space is
+ * available, comparing the function sbspace() with the amount of data to be
+ * added. sbappendrecord() differs from sbappend() in that data supplied is
+ * treated as the beginning of a new record. To place a sender's address,
+ * optional access rights, and data in a socket receive buffer,
+ * sbappendaddr() should be used. To place access rights and data in a
+ * socket receive buffer, sbappendrights() should be used. In either case,
+ * the new data begins a new record. Note that unlike sbappend() and
+ * sbappendrecord(), these routines check for the caller that there will be
+ * enough space to store the data. Each fails if there is not enough space,
+ * or if it cannot find mbufs to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data awaiting
+ * acknowledgement. Data is normally copied from a socket send buffer in a
+ * protocol with m_copy for output to a peer, and then removed from the
+ * socket buffer with sbdrop() or sbdroprecord() when the data is
+ * acknowledged by the peer.
+ */
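+
+/*
+ * Purely as an illustrative sketch of the calling pattern described above
+ * ("so", "sa", "m" and "acked" are hypothetical and locking/error detail
+ * is elided): a datagram protocol delivering a packet would do roughly
+ *
+ *	if (sbappendaddr(&so->so_rcv, sa, m, NULL) != 0)
+ *		sorwakeup(so);
+ *	else
+ *		m_freem(m);
+ *
+ * while a reliable stream protocol queues output with sbappend() or
+ * sbappendstream() and later trims acknowledged data with
+ *
+ *	sbdrop(&so->so_snd, acked);
+ */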
+#ifdef SOCKBUF_DEBUG
+void
+sblastrecordchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ if (m != sb->sb_lastrecord) {
+ printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_lastrecord, m);
+ printf("packet chain:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
+ printf("\t%p\n", m);
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+
+void
+sblastmbufchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ while (m && m->m_next)
+ m = m->m_next;
+
+ if (m != sb->sb_mbtail) {
+ printf("%s: sb_mb %p sb_mbtail %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_mbtail, m);
+ printf("packet tree:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
+ printf("\t");
+ for (n = m; n != NULL; n = n->m_next)
+ printf("%p ", n);
+ printf("\n");
+ }
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+#endif /* SOCKBUF_DEBUG */
+
+#define SBLINKRECORD(sb, m0) do { \
+ SOCKBUF_LOCK_ASSERT(sb); \
+ if ((sb)->sb_lastrecord != NULL) \
+ (sb)->sb_lastrecord->m_nextpkt = (m0); \
+ else \
+ (sb)->sb_mb = (m0); \
+ (sb)->sb_lastrecord = (m0); \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
+ * are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m == 0)
+ return;
+
+ SBLASTRECORDCHK(sb);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * XXX Would like to simply use sb_mbtail here, but
+ * XXX I need to verify that I won't miss an EOR that
+ * XXX way.
+ */
+ if ((n = sb->sb_lastrecord) != NULL) {
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * If this is the first record in the socket buffer,
+ * it's also the last record.
+ */
+ sb->sb_lastrecord = m;
+ }
+ }
+ sbcompress(sb, m, n);
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
+ * are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappend_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+ KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
+
+ SBLASTMBUFCHK(sb);
+
+ /* Remove all packet headers and mbuf tags to get a pure data chain. */
+ m_demote(m, 1);
+
+ sbcompress(sb, m, sb->sb_mbtail);
+
+ sb->sb_lastrecord = sb->sb_mb;
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendstream_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(struct sockbuf *sb)
+{
+ struct mbuf *m;
+ struct mbuf *n = 0;
+ u_long len = 0, mbcnt = 0;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 == 0)
+ return;
+ /*
+ * Put the first mbuf on the queue. Note this permits zero length
+ * records.
+ */
+ sballoc(sb, m0);
+ SBLASTRECORDCHK(sb);
+ SBLINKRECORD(sb, m0);
+ sb->sb_mbtail = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ /* always call sbcompress() so it can do SBLASTMBUFCHK() */
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendrecord_locked(sb, m0);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ struct mbuf *m, *n, *nlast;
+ int space = asa->sa_len;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr_locked");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ space += m_length(control, &n);
+
+ if (space > sbspace(sb))
+ return (0);
+#if MSIZE <= 256
+ if (asa->sa_len > MLEN)
+ return (0);
+#endif
+ m = m_get(M_NOWAIT, MT_SONAME);
+ if (m == NULL)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n->m_next != NULL; n = n->m_next)
+ sballoc(sb, n);
+ sballoc(sb, n);
+ nlast = n;
+ SBLINKRECORD(sb, m);
+
+ sb->sb_mbtail = nlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendaddr_locked(sb, asa, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+int
+sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+ struct mbuf *control)
+{
+ struct mbuf *m, *n, *mlast;
+ int space;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (control == 0)
+ panic("sbappendcontrol_locked");
+ space = m_length(control, &n) + m_length(m0, NULL);
+
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+
+ SBLASTRECORDCHK(sb);
+
+ for (m = control; m->m_next; m = m->m_next)
+ sballoc(sb, m);
+ sballoc(sb, m);
+ mlast = m;
+ SBLINKRECORD(sb, control);
+
+ sb->sb_mbtail = mlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+int
+sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendcontrol_locked(sb, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+/*
+ * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
+ * (n). If (n) is NULL, the buffer is presumed empty.
+ *
+ * When the data is compressed, mbufs in the chain may be handled in one of
+ * three ways:
+ *
+ * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
+ * record boundary, and no change in data type).
+ *
+ * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
+ * an mbuf already in the socket buffer. This can occur if an
+ * appropriate mbuf exists, there is room, and no merging of data types
+ * will occur.
+ *
+ * (3) The mbuf may be appended to the end of the existing mbuf chain.
+ *
+ * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
+ * end-of-record.
+ */
+void
+sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
+{
+ int eor = 0;
+ struct mbuf *o;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ if (sb->sb_lastrecord == m)
+ sb->sb_lastrecord = m->m_next;
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(n) &&
+ ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
+ m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ m->m_len <= M_TRAILINGSPACE(n) &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ /* XXX: Probably don't need.*/
+ sb->sb_ctl += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sb->sb_mbtail = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
+ n->m_flags |= eor;
+ }
+ SBLASTMBUFCHK(sb);
+}
+
+/*
+ * Free all mbufs in a sockbuf. Check that all resources are reclaimed.
+ */
+static void
+sbflush_internal(struct sockbuf *sb)
+{
+
+ while (sb->sb_mbcnt) {
+ /*
+ * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+ * we would loop forever. Panic instead.
+ */
+ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+ break;
+ sbdrop_internal(sb, (int)sb->sb_cc);
+ }
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
+ sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+void
+sbflush_locked(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ sbflush_internal(sb);
+}
+
+void
+sbflush(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbflush_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+static void
+sbdrop_internal(struct sockbuf *sb, int len)
+{
+ struct mbuf *m;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ if (sb->sb_sndptroff != 0)
+ sb->sb_sndptroff -= len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ sb->sb_ctl -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+ /*
+ * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure
+ * sb_lastrecord is up-to-date if we dropped part of the last record.
+ */
+ m = sb->sb_mb;
+ if (m == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (m->m_nextpkt == NULL) {
+ sb->sb_lastrecord = m;
+ }
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop_locked(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbdrop_internal(sb, len);
+}
+
+void
+sbdrop(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdrop_locked(sb, len);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Maintain a pointer and offset pair into the socket buffer mbuf chain to
+ * avoid traversal of the entire socket buffer for larger offsets.
+ */
+struct mbuf *
+sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
+{
+ struct mbuf *m, *ret;
+
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+ KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
+ KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
+
+ /*
+ * Is off below stored offset? Happens on retransmits.
+ * Just return, we can't help here.
+ */
+ if (sb->sb_sndptroff > off) {
+ *moff = off;
+ return (sb->sb_mb);
+ }
+
+ /* Return closest mbuf in chain for current offset. */
+ *moff = off - sb->sb_sndptroff;
+ m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+ if (*moff == m->m_len) {
+ *moff = 0;
+ sb->sb_sndptroff += m->m_len;
+ m = ret = m->m_next;
+ KASSERT(ret->m_len > 0,
+ ("mbuf %p in sockbuf %p chain has no valid data", ret, sb));
+ }
+
+ /* Advance by len to be as close as possible for the next transmit. */
+ for (off = off - sb->sb_sndptroff + len - 1;
+ off > 0 && m != NULL && off >= m->m_len;
+ m = m->m_next) {
+ sb->sb_sndptroff += m->m_len;
+ off -= m->m_len;
+ }
+ if (off > 0 && m == NULL)
+ panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret);
+ sb->sb_sndptr = m;
+
+ return (ret);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord_locked(struct sockbuf *sb)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ m = m_free(m);
+ } while (m);
+ }
+ SB_EMPTY_FIXUP(sb);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdroprecord_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Create a "control" mbuf containing the specified data with the specified
+ * type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(caddr_t p, int size, int type, int level)
+{
+ struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if (CMSG_SPACE((u_int)size) > MCLBYTES)
+ return ((struct mbuf *) NULL);
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_NOWAIT, MT_CONTROL);
+ if (m == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ m->m_len = 0;
+ KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+ ("sbcreatecontrol: short mbuf"));
+ if (p != NULL)
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ m->m_len = CMSG_SPACE(size);
+ cp->cmsg_len = CMSG_LEN(size);
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
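+
+/*
+ * Illustrative use of sbcreatecontrol(): a protocol might attach a receive
+ * timestamp to a datagram roughly as follows (a sketch; "tv" is a local
+ * struct timeval and the subsequent append/cleanup is elided):
+ *
+ *	microtime(&tv);
+ *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
+ *	    SCM_TIMESTAMP, SOL_SOCKET);
+ *	if (control != NULL)
+ *		... pass control to sbappendaddr() along with the data ...
+ */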
+
+/*
+ * This does the same for socket buffers that sotoxsocket does for sockets:
+ * generate a user-format data structure describing the socket buffer. Note
+ * that the xsockbuf structure, since it is always embedded in a socket, does
+ * not include a self pointer nor a length. We make this entry point public
+ * in case some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mcnt = sb->sb_mcnt;
+ xsb->sb_ccnt = sb->sb_ccnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
+ &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
+SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "Socket buffer size waste factor");
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
new file mode 100644
index 0000000..639d865
--- /dev/null
+++ b/sys/kern/uipc_socket.c
@@ -0,0 +1,3752 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2004 The FreeBSD Foundation
+ * Copyright (c) 2004-2008 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
+ */
+
+/*
+ * Comments on the socket life cycle:
+ *
+ * soalloc() sets up socket layer state for a socket, called only by
+ * socreate() and sonewconn(). Socket layer private.
+ *
+ * sodealloc() tears down socket layer state for a socket, called only by
+ * sofree() and sonewconn(). Socket layer private.
+ *
+ * pru_attach() associates protocol layer state with an allocated socket;
+ * called only once, may fail, aborting socket allocation. This is called
+ * from socreate() and sonewconn(). Socket layer private.
+ *
+ * pru_detach() disassociates protocol layer state from an attached socket,
+ * and will be called exactly once for sockets in which pru_attach() has
+ * been successfully called. If pru_attach() returned an error,
+ * pru_detach() will not be called. Socket layer private.
+ *
+ * pru_abort() and pru_close() notify the protocol layer that the last
+ * consumer of a socket is starting to tear down the socket, and that the
+ * protocol should terminate the connection. Historically, pru_abort() also
+ * detached protocol state from the socket state, but this is no longer the
+ * case.
+ *
+ * socreate() creates a socket and attaches protocol state. This is a public
+ * interface that may be used by socket layer consumers to create new
+ * sockets.
+ *
+ * sonewconn() creates a socket and attaches protocol state. This is a
+ * public interface that may be used by protocols to create new sockets when
+ * a new connection is received and will be available for accept() on a
+ * listen socket.
+ *
+ * soclose() destroys a socket after possibly waiting for it to disconnect.
+ * This is a public interface that socket consumers should use to close and
+ * release a socket when done with it.
+ *
+ * soabort() destroys a socket without waiting for it to disconnect (used
+ * only for incoming connections that are already partially or fully
+ * connected). This is used internally by the socket layer when clearing
+ * listen socket queues (due to overflow or close on the listen socket), but
+ * is also a public interface protocols may use to abort connections in
+ * their incomplete listen queues should they no longer be required. Sockets
+ * placed in completed connection listen queues should not be aborted for
+ * reasons described in the comment above the soclose() implementation. This
+ * is not a general purpose close routine, and except in the specific
+ * circumstances described here, should not be used.
+ *
+ * sofree() will free a socket and its protocol state if all references on
+ * the socket have been released, and is the public interface to attempt to
+ * free a socket when a reference is removed. This is a socket layer private
+ * interface.
+ *
+ * NOTE: In addition to socreate() and soclose(), which provide a single
+ * socket reference to the consumer to be managed as required, there are two
+ * calls to explicitly manage socket references, soref(), and sorele().
+ * Currently, these are generally required only when transitioning a socket
+ * from a listen queue to a file descriptor, in order to prevent garbage
+ * collection of the socket at an untimely moment. For a number of reasons,
+ * these interfaces are not preferred, and should be avoided.
+ *
+ * NOTE: With regard to VNETs the general rule is that callers do not set
+ * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
+ * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
+ * and sorflush(), which are usually called from a pre-set VNET context.
+ * sopoll() currently does not need a VNET context to be set.
+ */
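+
+/*
+ * A minimal sketch of the public create/close pattern described above, as
+ * an in-kernel consumer might use it (the UDP/IPv4 parameters are only an
+ * example and error handling past the create step is elided):
+ *
+ *	struct socket *so;
+ *	int error;
+ *
+ *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
+ *	    curthread->td_ucred, curthread);
+ *	if (error == 0) {
+ *		... sobind()/soconnect()/sosend() as required ...
+ *		soclose(so);
+ *	}
+ */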
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_zero.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mac.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/domain.h>
+#include <sys/file.h> /* for struct knote */
+#include <sys/kernel.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/resourcevar.h>
+#include <net/route.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/jail.h>
+#include <sys/syslog.h>
+#include <netinet/in.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/sysent.h>
+#include <compat/freebsd32/freebsd32.h>
+#endif
+
+static int soreceive_rcvoob(struct socket *so, struct uio *uio,
+ int flags);
+
+static void filt_sordetach(struct knote *kn);
+static int filt_soread(struct knote *kn, long hint);
+static void filt_sowdetach(struct knote *kn);
+static int filt_sowrite(struct knote *kn, long hint);
+static int filt_solisten(struct knote *kn, long hint);
+
+static struct filterops solisten_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sordetach,
+ .f_event = filt_solisten,
+};
+static struct filterops soread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sordetach,
+ .f_event = filt_soread,
+};
+static struct filterops sowrite_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sowdetach,
+ .f_event = filt_sowrite,
+};
+
+so_gen_t so_gencnt; /* generation count for sockets */
+
+MALLOC_DEFINE(M_SONAME, "soname", "socket name");
+MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
+
+#define VNET_SO_ASSERT(so) \
+ VNET_ASSERT(curvnet != NULL, \
+ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
+
+/*
+ * Limit on the number of connections in the listen queue waiting
+ * for accept(2).
+ * NB: The original sysctl somaxconn is still available but hidden
+ * to prevent confusion about the actual purpose of this number.
+ */
+static int somaxconn = SOMAXCONN;
+
+static int
+sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int val;
+
+ val = somaxconn;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (val < 1 || val > USHRT_MAX)
+ return (EINVAL);
+
+ somaxconn = val;
+ return (0);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(int), sysctl_somaxconn, "I",
+ "Maximum listen socket pending connection accept queue size");
+SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
+ 0, sizeof(int), sysctl_somaxconn, "I",
+ "Maximum listen socket pending connection accept queue size (compat)");
+
+static int numopensockets;
+SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
+ &numopensockets, 0, "Number of open sockets");
+
+#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
+SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
+ "Zero copy controls");
+#ifdef SOCKET_RECV_PFLIP
+int so_zero_copy_receive = 1;
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
+ &so_zero_copy_receive, 0, "Enable zero copy receive");
+#endif
+#ifdef SOCKET_SEND_COW
+int so_zero_copy_send = 1;
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
+ &so_zero_copy_send, 0, "Enable zero copy send");
+#endif /* SOCKET_SEND_COW */
+#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */
+
+/*
+ * accept_mtx locks down per-socket fields relating to accept queues. See
+ * socketvar.h for an annotation of the protected fields of struct socket.
+ */
+struct mtx accept_mtx;
+MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
+
+/*
+ * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
+ * so_gencnt field.
+ */
+static struct mtx so_global_mtx;
+MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
+
+/*
+ * General IPC sysctl name space, used by sockets and a variety of other IPC
+ * types.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/*
+ * Initialize the socket subsystem and set up the socket
+ * memory allocator.
+ */
+static uma_zone_t socket_zone;
+int maxsockets;
+
+static void
+socket_zone_change(void *tag)
+{
+
+ maxsockets = uma_zone_set_max(socket_zone, maxsockets);
+}
+
+static void
+socket_init(void *tag)
+{
+
+ socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ maxsockets = uma_zone_set_max(socket_zone, maxsockets);
+ uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
+ EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
+ EVENTHANDLER_PRI_FIRST);
+}
+SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
+
+/*
+ * Initialise maxsockets. This SYSINIT must be run after
+ * tunable_mbinit().
+ */
+static void
+init_maxsockets(void *ignored)
+{
+
+ TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+ maxsockets = imax(maxsockets, maxfiles);
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
+
+/*
+ * Sysctl to get and set the maximum global sockets limit. Notify protocols
+ * of the change so that they can update their dependent limits as required.
+ */
+static int
+sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
+{
+ int error, newmaxsockets;
+
+ newmaxsockets = maxsockets;
+ error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newmaxsockets > maxsockets &&
+ newmaxsockets <= maxfiles) {
+ maxsockets = newmaxsockets;
+ EVENTHANDLER_INVOKE(maxsockets_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
+ &maxsockets, 0, sysctl_maxsockets, "IU",
+ "Maximum number of sockets avaliable");
+
+/*
+ * Socket operation routines. These routines are called by the routines in
+ * sys_socket.c or from a system process, and implement the semantics of
+ * socket operations by switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it. Note that it
+ * would probably be better to allocate socket and PCB at the same time, but
+ * I'm not convinced that all the protocols can be easily modified to do
+ * this.
+ *
+ * soalloc() returns a socket with a ref count of 0.
+ */
+static struct socket *
+soalloc(struct vnet *vnet)
+{
+ struct socket *so;
+
+ so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
+ if (so == NULL)
+ return (NULL);
+#ifdef MAC
+ if (mac_socket_init(so, M_NOWAIT) != 0) {
+ uma_zfree(socket_zone, so);
+ return (NULL);
+ }
+#endif
+ SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
+ SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+ sx_init(&so->so_snd.sb_sx, "so_snd_sx");
+ sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
+ TAILQ_INIT(&so->so_aiojobq);
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ ++numopensockets;
+#ifdef VIMAGE
+ VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
+ __func__, __LINE__, so));
+ vnet->vnet_sockcnt++;
+ so->so_vnet = vnet;
+#endif
+ mtx_unlock(&so_global_mtx);
+ return (so);
+}
+
+/*
+ * Free the storage associated with a socket at the socket layer, tear down
+ * locks, labels, etc. All protocol state is assumed already to have been
+ * torn down (and possibly never set up) by the caller.
+ */
+static void
+sodealloc(struct socket *so)
+{
+
+ KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
+ KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
+
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ --numopensockets; /* Could be below, but faster here. */
+#ifdef VIMAGE
+ VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
+ __func__, __LINE__, so));
+ so->so_vnet->vnet_sockcnt--;
+#endif
+ mtx_unlock(&so_global_mtx);
+ if (so->so_rcv.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+ if (so->so_snd.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+#ifdef INET
+ /* Remove accept filter if one is present. */
+ if (so->so_accf != NULL)
+ do_setopt_accept_filter(so, NULL);
+#endif
+#ifdef MAC
+ mac_socket_destroy(so);
+#endif
+ crfree(so->so_cred);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ uma_zfree(socket_zone, so);
+}
+
+/*
+ * socreate returns a socket with a ref count of 1. The socket should be
+ * closed with soclose().
+ */
+int
+socreate(int dom, struct socket **aso, int type, int proto,
+ struct ucred *cred, struct thread *td)
+{
+ struct protosw *prp;
+ struct socket *so;
+ int error;
+
+ if (proto)
+ prp = pffindproto(dom, proto, type);
+ else
+ prp = pffindtype(dom, type);
+
+ if (prp == NULL) {
+ /* No support for domain. */
+ if (pffinddomain(dom) == NULL)
+ return (EAFNOSUPPORT);
+ /* No support for socket type. */
+ if (proto == 0 && type != 0)
+ return (EPROTOTYPE);
+ return (EPROTONOSUPPORT);
+ }
+ if (prp->pr_usrreqs->pru_attach == NULL ||
+ prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
+ return (EPROTONOSUPPORT);
+
+ if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
+ return (EPROTONOSUPPORT);
+
+ if (prp->pr_type != type)
+ return (EPROTOTYPE);
+ so = soalloc(CRED_TO_VNET(cred));
+ if (so == NULL)
+ return (ENOBUFS);
+
+ TAILQ_INIT(&so->so_incomp);
+ TAILQ_INIT(&so->so_comp);
+ so->so_type = type;
+ so->so_cred = crhold(cred);
+ if ((prp->pr_domain->dom_family == PF_INET) ||
+ (prp->pr_domain->dom_family == PF_INET6) ||
+ (prp->pr_domain->dom_family == PF_ROUTE))
+ so->so_fibnum = td->td_proc->p_fibnum;
+ else
+ so->so_fibnum = 0;
+ so->so_proto = prp;
+#ifdef MAC
+ mac_socket_create(cred, so);
+#endif
+ knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ so->so_count = 1;
+ /*
+ * Auto-sizing of socket buffers is managed by the protocols and
+ * the appropriate flags must be set in the pru_attach function.
+ */
+ CURVNET_SET(so->so_vnet);
+ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
+ CURVNET_RESTORE();
+ if (error) {
+ KASSERT(so->so_count == 1, ("socreate: so_count %d",
+ so->so_count));
+ so->so_count = 0;
+ sodealloc(so);
+ return (error);
+ }
+ *aso = so;
+ return (0);
+}
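+
+/*
+ * Example (illustrative sketch only): a typical in-kernel consumer pairs
+ * socreate() with soclose(), e.g. to open a UDP socket on behalf of the
+ * current thread 'td':
+ *
+ *	struct socket *so;
+ *	int error;
+ *
+ *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
+ *	    td->td_ucred, td);
+ *	if (error != 0)
+ *		return (error);
+ *	...
+ *	soclose(so);
+ */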
+
+#ifdef REGRESSION
+static int regression_sonewconn_earlytest = 1;
+SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
+ &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
+#endif
+
+/*
+ * When an attempt at a new connection is noted on a socket which accepts
+ * connections, sonewconn is called. If the connection is possible (subject
+ * to space constraints, etc.) then we allocate a new structure, properly
+ * linked into the data structure of the original socket, and return this.
+ * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
+ *
+ * Note: the ref count on the socket is 0 on return.
+ */
+struct socket *
+sonewconn(struct socket *head, int connstatus)
+{
+ struct socket *so;
+ int over;
+
+ ACCEPT_LOCK();
+ over = (head->so_qlen > 3 * head->so_qlimit / 2);
+ ACCEPT_UNLOCK();
+#ifdef REGRESSION
+ if (regression_sonewconn_earlytest && over) {
+#else
+ if (over) {
+#endif
+ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
+ "%i already in queue awaiting acceptance\n",
+ __func__, head->so_pcb, head->so_qlen);
+ return (NULL);
+ }
+ VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+ __func__, __LINE__, head));
+ so = soalloc(head->so_vnet);
+ if (so == NULL) {
+ log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+ "limit reached or out of memory\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((head->so_options & SO_ACCEPTFILTER) != 0)
+ connstatus = 0;
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_fibnum = head->so_fibnum;
+ so->so_proto = head->so_proto;
+ so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+ mac_socket_newconn(head, so);
+#endif
+ knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ VNET_SO_ASSERT(head);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+ so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+ so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+ so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+ so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+ so->so_state |= connstatus;
+ ACCEPT_LOCK();
+ /*
+ * The accept socket may be tearing down but we just
+ * won a race on the ACCEPT_LOCK.
+ * However, if sctp_peeloff() is called on a 1-to-many
+ * style socket, the SO_ACCEPTCONN doesn't need to be set.
+ */
+ if (!(head->so_options & SO_ACCEPTCONN) &&
+ ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
+ (head->so_type != SOCK_SEQPACKET))) {
+ SOCK_LOCK(so);
+ so->so_head = NULL;
+ sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */
+ return (NULL);
+ }
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_qstate |= SQ_COMP;
+ head->so_qlen++;
+ } else {
+ /*
+ * Keep removing sockets from the head until there's room for
+ * us to insert on the tail. In pre-locking revisions, this
+ * was a simple if(), but as we could be racing with other
+ * threads and soabort() requires dropping locks, we must
+ * loop waiting for the condition to be true.
+ */
+ while (head->so_incqlen > head->so_qlimit) {
+ struct socket *sp;
+ sp = TAILQ_FIRST(&head->so_incomp);
+ TAILQ_REMOVE(&head->so_incomp, sp, so_list);
+ head->so_incqlen--;
+ sp->so_qstate &= ~SQ_INCOMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_qstate |= SQ_INCOMP;
+ head->so_incqlen++;
+ }
+ ACCEPT_UNLOCK();
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ }
+ return (so);
+}
+
+int
+sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * solisten() transitions a socket from a non-listening state to a listening
+ * state, but can also be used to update the listen queue depth on an
+ * existing listen socket. The protocol will call back into the sockets
+ * layer using solisten_proto_check() and solisten_proto() to check and set
+ * socket-layer listen state. Call backs are used so that the protocol can
+ * acquire both protocol and socket layer locks in whatever order is required
+ * by the protocol.
+ *
+ * Protocol implementors are advised to hold the socket lock across the
+ * socket-layer test and set to avoid races at the socket layer.
+ */
+int
+solisten(struct socket *so, int backlog, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+solisten_proto_check(struct socket *so)
+{
+
+ SOCK_LOCK_ASSERT(so);
+
+ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
+ SS_ISDISCONNECTING))
+ return (EINVAL);
+ return (0);
+}
+
+void
+solisten_proto(struct socket *so, int backlog)
+{
+
+ SOCK_LOCK_ASSERT(so);
+
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
+ so->so_options |= SO_ACCEPTCONN;
+}
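+
+/*
+ * Illustrative sketch only (PROTO_LOCK/PROTO_UNLOCK stand in for a
+ * protocol's own locking and are not real macros): a pru_listen
+ * implementation typically brackets the socket-layer calls like this:
+ *
+ *	PROTO_LOCK(pcb);
+ *	SOCK_LOCK(so);
+ *	error = solisten_proto_check(so);
+ *	if (error == 0)
+ *		solisten_proto(so, backlog);
+ *	SOCK_UNLOCK(so);
+ *	PROTO_UNLOCK(pcb);
+ */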
+
+/*
+ * Evaluate the reference count and named references on a socket; if no
+ * references remain, free it. This should be called whenever a reference is
+ * released, such as in sorele(), but also when named reference flags are
+ * cleared in socket or protocol code.
+ *
+ * sofree() will free the socket if:
+ *
+ * - There are no outstanding file descriptor references or related consumers
+ * (so_count == 0).
+ *
+ * - The socket has been closed by user space, if ever open (SS_NOFDREF).
+ *
+ * - The protocol does not have an outstanding strong reference on the socket
+ * (SS_PROTOREF).
+ *
+ * - The socket is not on a completed connection queue, where a process would
+ *   already have been notified of its presence. If it were removed from such
+ *   a queue, the user process could block in accept() despite select() having
+ *   said the socket was ready.
+ */
+void
+sofree(struct socket *so)
+{
+ struct protosw *pr = so->so_proto;
+ struct socket *head;
+
+ ACCEPT_LOCK_ASSERT();
+ SOCK_LOCK_ASSERT(so);
+
+ if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
+ (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+ return;
+ }
+
+ head = so->so_head;
+ if (head != NULL) {
+ KASSERT((so->so_qstate & SQ_COMP) != 0 ||
+ (so->so_qstate & SQ_INCOMP) != 0,
+ ("sofree: so_head != NULL, but neither SQ_COMP nor "
+ "SQ_INCOMP"));
+ KASSERT((so->so_qstate & SQ_COMP) == 0 ||
+ (so->so_qstate & SQ_INCOMP) == 0,
+ ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_qstate &= ~SQ_INCOMP;
+ so->so_head = NULL;
+ }
+ KASSERT((so->so_qstate & SQ_COMP) == 0 &&
+ (so->so_qstate & SQ_INCOMP) == 0,
+ ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
+ so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
+ if (so->so_options & SO_ACCEPTCONN) {
+ KASSERT((TAILQ_EMPTY(&so->so_comp)),
+ ("sofree: so_comp populated"));
+ KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+ ("sofree: so_incomp populated"));
+ }
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+
+ VNET_SO_ASSERT(so);
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+ (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+ if (pr->pr_usrreqs->pru_detach != NULL)
+ (*pr->pr_usrreqs->pru_detach)(so);
+
+ /*
+ * From this point on, we assume that no other references to this
+ * socket exist anywhere else in the stack. Therefore, no locks need
+ * to be acquired or held.
+ *
+ * We used to do a lot of socket buffer and socket locking here, as
+ * well as invoke sorflush() and perform wakeups. The direct calls to
+ * dom_dispose() and sbrelease_internal() are an inlining of what was
+ * necessary from sorflush().
+ *
+ * Notice that the socket buffer and kqueue state are torn down
+ * before calling pru_detach. This means that protocols should not
+ * assume they can perform socket wakeups, etc., in their detach code.
+ */
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
+ seldrain(&so->so_snd.sb_sel);
+ seldrain(&so->so_rcv.sb_sel);
+ knlist_destroy(&so->so_rcv.sb_sel.si_note);
+ knlist_destroy(&so->so_snd.sb_sel.si_note);
+ sodealloc(so);
+}
+
+/*
+ * Close a socket on last file table reference removal. Initiate disconnect
+ * if connected. Free socket when disconnect complete.
+ *
+ * This function will sorele() the socket. Note that soclose() may be called
+ * prior to the ref count reaching zero. The actual socket structure will
+ * not be freed until the ref count reaches zero.
+ */
+int
+soclose(struct socket *so)
+{
+ int error = 0;
+
+ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
+
+ CURVNET_SET(so->so_vnet);
+ funsetown(&so->so_sigio);
+ if (so->so_state & SS_ISCONNECTED) {
+ if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+ error = sodisconnect(so);
+ if (error) {
+ if (error == ENOTCONN)
+ error = 0;
+ goto drop;
+ }
+ }
+ if (so->so_options & SO_LINGER) {
+ if ((so->so_state & SS_ISDISCONNECTING) &&
+ (so->so_state & SS_NBIO))
+ goto drop;
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep(&so->so_timeo,
+ PSOCK | PCATCH, "soclos",
+ so->so_linger * hz);
+ if (error)
+ break;
+ }
+ }
+ }
+
+drop:
+ if (so->so_proto->pr_usrreqs->pru_close != NULL)
+ (*so->so_proto->pr_usrreqs->pru_close)(so);
+ ACCEPT_LOCK();
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct socket *sp;
+ /*
+ * Prevent new additions to the accept queues due
+ * to ACCEPT_LOCK races while we are draining them.
+ */
+ so->so_options &= ~SO_ACCEPTCONN;
+ while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
+ TAILQ_REMOVE(&so->so_incomp, sp, so_list);
+ so->so_incqlen--;
+ sp->so_qstate &= ~SQ_INCOMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
+ TAILQ_REMOVE(&so->so_comp, sp, so_list);
+ so->so_qlen--;
+ sp->so_qstate &= ~SQ_COMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ KASSERT((TAILQ_EMPTY(&so->so_comp)),
+ ("%s: so_comp populated", __func__));
+ KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+ ("%s: so_incomp populated", __func__));
+ }
+ SOCK_LOCK(so);
+ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
+ so->so_state |= SS_NOFDREF;
+ sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * soabort() is used to abruptly tear down a connection, such as when a
+ * resource limit is reached (listen queue depth exceeded), or if a listen
+ * socket is closed while there are sockets waiting to be accepted.
+ *
+ * This interface is tricky, because it is called on an unreferenced socket,
+ * and must be called only by a thread that has actually removed the socket
+ * from the listen queue it was on, or races with other threads are risked.
+ *
+ * This interface will call into the protocol code, so must not be called
+ * with any socket locks held. Protocols do call it while holding their own
+ * recursible protocol mutexes, but this is something that should be subject
+ * to review in the future.
+ */
+void
+soabort(struct socket *so)
+{
+
+ /*
+ * In as much as is possible, assert that no references to this
+ * socket are held. This is not quite the same as asserting that the
+ * current thread is responsible for arranging for no references, but
+ * is as close as we can get for now.
+ */
+ KASSERT(so->so_count == 0, ("soabort: so_count"));
+ KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
+ KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
+ KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
+ KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+ VNET_SO_ASSERT(so);
+
+ if (so->so_proto->pr_usrreqs->pru_abort != NULL)
+ (*so->so_proto->pr_usrreqs->pru_abort)(so);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ sofree(so);
+}
+
+int
+soaccept(struct socket *so, struct sockaddr **nam)
+{
+ int error;
+
+ SOCK_LOCK(so);
+ KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
+ so->so_state &= ~SS_NOFDREF;
+ SOCK_UNLOCK(so);
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (soconnectat(AT_FDCWD, so, nam, td));
+}
+
+int
+soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ if (so->so_options & SO_ACCEPTCONN)
+ return (EOPNOTSUPP);
+
+ CURVNET_SET(so->so_vnet);
+ /*
+ * If protocol is connection-based, can only connect once.
+ * Otherwise, if connected, try to disconnect first. This allows
+ * user to disconnect by connecting to, e.g., a null address.
+ */
+ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
+ ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
+ (error = sodisconnect(so)))) {
+ error = EISCONN;
+ } else {
+ /*
+ * Prevent accumulated error from previous connection from
+ * biting us.
+ */
+ so->so_error = 0;
+ if (fd == AT_FDCWD) {
+ error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
+ nam, td);
+ } else {
+ error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
+ so, nam, td);
+ }
+ }
+ CURVNET_RESTORE();
+
+ return (error);
+}
+
+int
+soconnect2(struct socket *so1, struct socket *so2)
+{
+ int error;
+
+ CURVNET_SET(so1->so_vnet);
+ error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sodisconnect(struct socket *so)
+{
+ int error;
+
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return (ENOTCONN);
+ if (so->so_state & SS_ISDISCONNECTING)
+ return (EALREADY);
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
+ return (error);
+}
+
+#ifdef SOCKET_SEND_COW
+struct so_zerocopy_stats {
+ int size_ok;
+ int align_ok;
+ int found_ifp;
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+
+/*
+ * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ *
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio. If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *mpp. The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+ int flags)
+{
+ struct mbuf *m, **mp, *top;
+ long len;
+ ssize_t resid;
+ int error;
+ int cow_send;
+
+ *retmp = top = NULL;
+ mp = &top;
+ len = 0;
+ resid = uio->uio_resid;
+ error = 0;
+ do {
+ cow_send = 0;
+ if (resid >= MINCLSIZE) {
+ if (top == NULL) {
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+ } else
+ m = m_get(M_WAITOK, MT_DATA);
+ if (so_zero_copy_send &&
+ resid >= PAGE_SIZE &&
+ *space >= PAGE_SIZE &&
+ uio->uio_iov->iov_len >= PAGE_SIZE) {
+ so_zerocp_stats.size_ok++;
+ so_zerocp_stats.align_ok++;
+ cow_send = socow_setup(m, uio);
+ len = cow_send;
+ }
+ if (!cow_send) {
+ m_clget(m, M_WAITOK);
+ len = min(min(MCLBYTES, resid), *space);
+ }
+ } else {
+ if (top == NULL) {
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+
+ len = min(min(MHLEN, resid), *space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_WAITOK, MT_DATA);
+ len = min(min(MLEN, resid), *space);
+ }
+ }
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+
+ *space -= len;
+ if (cow_send)
+ error = 0;
+ else
+ error = uiomove(mtod(m, void *), (int)len, uio);
+ resid = uio->uio_resid;
+ m->m_len = len;
+ *mp = m;
+ top->m_pkthdr.len += len;
+ if (error)
+ goto out;
+ mp = &m->m_next;
+ if (resid <= 0) {
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ break;
+ }
+ } while (*space > 0 && atomic);
+out:
+ *retmp = top;
+ return (error);
+}
+#endif /* SOCKET_SEND_COW */
+
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
+int
+sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ long space;
+ ssize_t resid;
+ int clen = 0, error, dontroute;
+#ifdef SOCKET_SEND_COW
+ int atomic = sosendallatonce(so) || top;
+#endif
+
+ KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
+ KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+ ("sosend_dgram: !PR_ATOMIC"));
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ */
+ if (resid < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+ if (control != NULL)
+ clen = control->m_len;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto out;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-based
+ * socket if it supports implied connect. Return ENOTCONN if
+ * not connected and no address is supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto out;
+ }
+ } else if (addr == NULL) {
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ }
+
+ /*
+ * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
+ * problem and need fixing.
+ */
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ space -= clen;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (resid > space) {
+ error = EMSGSIZE;
+ goto out;
+ }
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
+#ifdef SOCKET_SEND_COW
+ error = sosend_copyin(uio, &top, atomic, &space, flags);
+ if (error)
+ goto out;
+#else
+ /*
+ * Copy the data from userland into an mbuf chain.
+ * If no data is to be copied in, a single empty mbuf
+ * is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+ (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+ if (top == NULL) {
+ error = EFAULT; /* only possible error */
+ goto out;
+ }
+ space -= resid - uio->uio_resid;
+#endif /* SOCKET_SEND_COW */
+ resid = uio->uio_resid;
+ }
+ KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+ /*
+ * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+ * than with.
+ */
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously done could be out
+ * of date. We could have received a reset packet in an interrupt or
+ * maybe we slept while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the locking protection here, but
+ * there are probably other places that this also happens. We must
+ * rethink this.
+ */
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands this flag, and
+ * there is nothing left to send, then use PRU_SEND_EOF instead of
+ * PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+out:
+ if (top != NULL)
+ m_freem(top);
+ if (control != NULL)
+ m_freem(control);
+ return (error);
+}
+
+/*
+ * Send on a socket. If send must go all at once and message is larger than
+ * send buffering, then hard error. Lock against other senders. If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing. Otherwise, if nonblocking, send as much as
+ * possible. The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not). Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned. Data and control buffers are freed
+ * on return.
+ */
+int
+sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ long space;
+ ssize_t resid;
+ int clen = 0, error, dontroute;
+ int atomic = sosendallatonce(so) || top;
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
+ (so->so_proto->pr_flags & PR_ATOMIC);
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+ if (control != NULL)
+ clen = control->m_len;
+
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+
+restart:
+ do {
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto release;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto release;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-
+ * based socket if it supports implied connect.
+ * Return ENOTCONN if not connected and no address is
+ * supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto release;
+ }
+ } else if (addr == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ goto release;
+ }
+ }
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ if ((atomic && resid > so->so_snd.sb_hiwat) ||
+ clen > so->so_snd.sb_hiwat) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EMSGSIZE;
+ goto release;
+ }
+ if (space < resid + clen &&
+ (atomic || space < so->so_snd.sb_lowat || space < clen)) {
+ if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ error = sbwait(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (error)
+ goto release;
+ goto restart;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ space -= clen;
+ do {
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
+#ifdef SOCKET_SEND_COW
+ error = sosend_copyin(uio, &top, atomic,
+ &space, flags);
+ if (error != 0)
+ goto release;
+#else
+ /*
+ * Copy the data from userland into an mbuf
+ * chain. If no data is to be copied in,
+ * a single empty mbuf is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space,
+ (atomic ? max_hdr : 0),
+ (atomic ? M_PKTHDR : 0) |
+ ((flags & MSG_EOR) ? M_EOR : 0));
+ if (top == NULL) {
+ error = EFAULT; /* only possible error */
+ goto release;
+ }
+ space -= resid - uio->uio_resid;
+#endif /* SOCKET_SEND_COW */
+ resid = uio->uio_resid;
+ }
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date. We could have received
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We
+ * could probably recheck again inside the locking
+ * protection here, but there are probably other
+ * places that this also happens. We must rethink
+ * this.
+ */
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands
+ * this flag, and there is nothing left to send, then
+ * use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME. */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+ if (error)
+ goto release;
+ } while (resid && space > 0);
+ } while (resid);
+
+release:
+ sbunlock(&so->so_snd);
+out:
+ if (top != NULL)
+ m_freem(top);
+ if (control != NULL)
+ m_freem(control);
+ return (error);
+}
+
+int
+sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
+ control, flags, td);
+ CURVNET_RESTORE();
+ return (error);
+}
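+
+/*
+ * Example (illustrative only): a kernel caller that already holds a fully
+ * formed mbuf chain 'm' can pass it as 'top' with a NULL uio:
+ *
+ *	error = sosend(so, NULL, NULL, m, NULL, 0, curthread);
+ *
+ * As documented above, data and control mbufs are consumed whether or not
+ * an error is returned.
+ */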
+
+/*
+ * The part of soreceive() that implements reading non-inline out-of-band
+ * data from a socket. For more complete comments, see soreceive(), from
+ * which this code originated.
+ *
+ * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
+ * unable to return an mbuf chain to the caller.
+ */
+static int
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
+{
+ struct protosw *pr = so->so_proto;
+ struct mbuf *m;
+ int error;
+
+ KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
+ VNET_SO_ASSERT(so);
+
+ m = m_get(M_WAITOK, MT_DATA);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+ if (error)
+ goto bad;
+ do {
+#ifdef SOCKET_RECV_PFLIP
+ if (so_zero_copy_receive) {
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ error = uiomoveco(mtod(m, void *),
+ min(uio->uio_resid, m->m_len), uio, disposable);
+ } else
+#endif /* SOCKET_RECV_PFLIP */
+ error = uiomove(mtod(m, void *),
+ (int) min(uio->uio_resid, m->m_len), uio);
+ m = m_free(m);
+ } while (uio->uio_resid && error == 0 && m);
+bad:
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+/*
+ * Following replacement or removal of the first mbuf on the first mbuf chain
+ * of a socket buffer, push necessary state changes back into the socket
+ * buffer so that other consumers see the values consistently. 'nextrecord'
+ * is the callers locally stored value of the original value of
+ * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
+ * NOTE: 'nextrecord' may be NULL.
+ */
+static __inline void
+sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ /*
+ * First, update for the new value of nextrecord. If necessary, make
+ * it the first record.
+ */
+ if (sb->sb_mb != NULL)
+ sb->sb_mb->m_nextpkt = nextrecord;
+ else
+ sb->sb_mb = nextrecord;
+
+ /*
+ * Now update any dependent socket buffer fields to reflect the new
+ * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
+ * addition of a second clause that takes care of the case where
+ * sb_mb has been updated, but remains the last record.
+ */
+ if (sb->sb_mb == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (sb->sb_mb->m_nextpkt == NULL)
+ sb->sb_lastrecord = sb->sb_mb;
+}
+
+/*
+ * Implement receive operations on a socket. We depend on the way that
+ * records are added to the sockbuf by sbappend. In particular, each record
+ * (mbufs linked through m_next) must begin with an address if the protocol
+ * so specifies, followed by an optional mbuf or mbufs containing ancillary
+ * data, and then zero or more mbufs of data. In order to allow parallelism
+ * between network receive and copying to user space, as well as avoid
+ * sleeping with a mutex held, we release the socket buffer mutex during the
+ * user space copy. Although the sockbuf remains locked against other readers,
+ * new data may still be appended while the mutex is dropped, and thus we must
+ * maintain consistency of the sockbuf during that time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying an
+ * mbuf **mp0 for use in returning the chain. The uio is then used only for
+ * the count in uio_resid.
+ */
+int
+soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct mbuf *m, **mp;
+ int flags, error, offset;
+ ssize_t len;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+ int moff, type = 0;
+ ssize_t orig_resid = uio->uio_resid;
+
+ mp = mp0;
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp != NULL)
+ *mp = NULL;
+ if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
+ && uio->uio_resid) {
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, 0);
+ }
+
+ error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+
+restart:
+ SOCKBUF_LOCK(&so->so_rcv);
+ m = so->so_rcv.sb_mb;
+ /*
+ * If we have less data than requested, block awaiting more (subject
+ * to any timeout) if:
+ * 1. the current count is less than the low water mark, or
+ * 2. MSG_DONTWAIT is not set
+ */
+ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
+ so->so_rcv.sb_cc < uio->uio_resid) &&
+ so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
+ m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
+ KASSERT(m != NULL || !so->so_rcv.sb_cc,
+ ("receive: m == %p so->so_rcv.sb_cc == %u",
+ m, so->so_rcv.sb_cc));
+ if (so->so_error) {
+ if (m != NULL)
+ goto dontblock;
+ error = so->so_error;
+ if ((flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ if (m == NULL) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ } else
+ goto dontblock;
+ }
+ for (; m != NULL; m = m->m_next)
+ if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
+ m = so->so_rcv.sb_mb;
+ goto dontblock;
+ }
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ error = ENOTCONN;
+ goto release;
+ }
+ if (uio->uio_resid == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ if (error)
+ goto release;
+ goto restart;
+ }
+dontblock:
+ /*
+ * From this point onward, we maintain 'nextrecord' as a cache of the
+ * pointer to the next record in the socket buffer. We must keep the
+ * various socket buffer pointers and local stack versions of the
+ * pointers in sync, pushing out modifications before dropping the
+ * socket buffer mutex, and re-reading them when picking it up.
+ *
+ * Otherwise, we will race with the network stack appending new data
+ * or records onto the socket buffer by using inconsistent/stale
+ * versions of the field, possibly resulting in socket buffer
+ * corruption.
+ *
+ * By holding the high-level sblock(), we prevent simultaneous
+ * readers from pulling off the front of the socket buffer.
+ */
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+ KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ nextrecord = m->m_nextpkt;
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ orig_resid = 0;
+ if (psa != NULL)
+ *psa = sodupsockaddr(mtod(m, struct sockaddr *),
+ M_NOWAIT);
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ }
+ }
+
+ /*
+ * Process one or more MT_CONTROL mbufs present before any data mbufs
+ * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
+ * just copy the data; if !MSG_PEEK, we call into the protocol to
+ * perform externalization (or freeing if controlp == NULL).
+ */
+ if (m != NULL && m->m_type == MT_CONTROL) {
+ struct mbuf *cm = NULL, *cmn;
+ struct mbuf **cme = &cm;
+
+ do {
+ if (flags & MSG_PEEK) {
+ if (controlp != NULL) {
+ *controlp = m_copy(m, 0, m->m_len);
+ controlp = &(*controlp)->m_next;
+ }
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m->m_next;
+ m->m_next = NULL;
+ *cme = m;
+ cme = &(*cme)->m_next;
+ m = so->so_rcv.sb_mb;
+ }
+ } while (m != NULL && m->m_type == MT_CONTROL);
+ if ((flags & MSG_PEEK) == 0)
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ while (cm != NULL) {
+ cmn = cm->m_next;
+ cm->m_next = NULL;
+ if (pr->pr_domain->dom_externalize != NULL) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ error = (*pr->pr_domain->dom_externalize)
+ (cm, controlp, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ } else if (controlp != NULL)
+ *controlp = cm;
+ else
+ m_freem(cm);
+ if (controlp != NULL) {
+ orig_resid = 0;
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ cm = cmn;
+ }
+ if (m != NULL)
+ nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+ else
+ nextrecord = so->so_rcv.sb_mb;
+ orig_resid = 0;
+ }
+ if (m != NULL) {
+ if ((flags & MSG_PEEK) == 0) {
+ KASSERT(m->m_nextpkt == nextrecord,
+ ("soreceive: post-control, nextrecord !sync"));
+ if (nextrecord == NULL) {
+ KASSERT(so->so_rcv.sb_mb == m,
+ ("soreceive: post-control, sb_mb!=m"));
+ KASSERT(so->so_rcv.sb_lastrecord == m,
+ ("soreceive: post-control, lastrecord!=m"));
+ }
+ }
+ type = m->m_type;
+ if (type == MT_OOBDATA)
+ flags |= MSG_OOB;
+ } else {
+ if ((flags & MSG_PEEK) == 0) {
+ KASSERT(so->so_rcv.sb_mb == nextrecord,
+ ("soreceive: sb_mb != nextrecord"));
+ if (so->so_rcv.sb_mb == NULL) {
+ KASSERT(so->so_rcv.sb_lastrecord == NULL,
+ ("soreceive: sb_lastrecord != NULL"));
+ }
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+
+ /*
+ * Now continue to read any data mbufs off of the head of the socket
+ * buffer until the read request is satisfied. Note that 'type' is
+ * used to store the type of any mbuf reads that have happened so far
+ * such that soreceive() can stop reading if the type changes, which
+ * causes soreceive() to return only one of regular data and inline
+ * out-of-band data in a single socket receive operation.
+ */
+ moff = 0;
+ offset = 0;
+ while (m != NULL && uio->uio_resid > 0 && error == 0) {
+ /*
+ * If the type of mbuf has changed since the last mbuf
+ * examined ('type'), end the receive operation.
+ */
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
+ if (type != m->m_type)
+ break;
+ } else if (type == MT_OOBDATA)
+ break;
+ else
+ KASSERT(m->m_type == MT_DATA,
+ ("m->m_type == %d", m->m_type));
+ so->so_rcv.sb_state &= ~SBS_RCVATMARK;
+ len = uio->uio_resid;
+ if (so->so_oobmark && len > so->so_oobmark - offset)
+ len = so->so_oobmark - offset;
+ if (len > m->m_len - moff)
+ len = m->m_len - moff;
+ /*
+ * If mp is set, just pass back the mbufs. Otherwise copy
+ * them out via the uio, then free. Sockbuf must be
+ * consistent here (points to current mbuf, it points to next
+ * record) when we drop priority; we must note any additions
+ * to the sockbuf when we block interrupts again.
+ */
+ if (mp == NULL) {
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+#ifdef SOCKET_RECV_PFLIP
+ if (so_zero_copy_receive) {
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ error = uiomoveco(mtod(m, char *) + moff,
+ (int)len, uio, disposable);
+ } else
+#endif /* SOCKET_RECV_PFLIP */
+ error = uiomove(mtod(m, char *) + moff, (int)len, uio);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (error) {
+ /*
+ * The MT_SONAME mbuf has already been removed
+ * from the record, so it is necessary to
+ * remove the data mbufs, if any, to preserve
+ * the invariant in the case of PR_ADDR that
+ * requires MT_SONAME mbufs at the head of
+ * each record.
+ */
+ if (m && pr->pr_flags & PR_ATOMIC &&
+ ((flags & MSG_PEEK) == 0))
+ (void)sbdroprecord_locked(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ } else
+ uio->uio_resid -= len;
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (len == m->m_len - moff) {
+ if (m->m_flags & M_EOR)
+ flags |= MSG_EOR;
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ moff = 0;
+ } else {
+ nextrecord = m->m_nextpkt;
+ sbfree(&so->so_rcv, m);
+ if (mp != NULL) {
+ m->m_nextpkt = NULL;
+ *mp = m;
+ mp = &m->m_next;
+ so->so_rcv.sb_mb = m = m->m_next;
+ *mp = NULL;
+ } else {
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ }
+ } else {
+ if (flags & MSG_PEEK)
+ moff += len;
+ else {
+ if (mp != NULL) {
+ int copy_flag;
+
+ if (flags & MSG_DONTWAIT)
+ copy_flag = M_NOWAIT;
+ else
+ copy_flag = M_WAIT;
+ if (copy_flag == M_WAITOK)
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ *mp = m_copym(m, 0, len, copy_flag);
+ if (copy_flag == M_WAITOK)
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (*mp == NULL) {
+ /*
+ * m_copym() couldn't
+ * allocate an mbuf. Adjust
+ * uio_resid back (it was
+ * adjusted down by len
+ * bytes, which we didn't end
+ * up "copying" over).
+ */
+ uio->uio_resid += len;
+ break;
+ }
+ }
+ m->m_data += len;
+ m->m_len -= len;
+ so->so_rcv.sb_cc -= len;
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_oobmark) {
+ if ((flags & MSG_PEEK) == 0) {
+ so->so_oobmark -= len;
+ if (so->so_oobmark == 0) {
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ break;
+ }
+ } else {
+ offset += len;
+ if (offset == so->so_oobmark)
+ break;
+ }
+ }
+ if (flags & MSG_EOR)
+ break;
+ /*
+ * If the MSG_WAITALL flag is set (for non-atomic socket), we
+ * must not quit until "uio->uio_resid == 0" or an error
+ * termination. If a signal/timeout occurs, return with a
+ * short count but without error. Keep sockbuf locked
+ * against other readers.
+ */
+ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
+ !sosendallatonce(so) && nextrecord == NULL) {
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_error ||
+ so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ break;
+ /*
+ * Notify the protocol that some data has been
+ * drained before blocking.
+ */
+ if (pr->pr_flags & PR_WANTRCVD) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ /*
+ * We could have received some data while we were notifying
+ * the protocol. Skip blocking in this case.
+ */
+ if (so->so_rcv.sb_mb == NULL) {
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ }
+ m = so->so_rcv.sb_mb;
+ if (m != NULL)
+ nextrecord = m->m_nextpkt;
+ }
+ }
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (m != NULL && pr->pr_flags & PR_ATOMIC) {
+ flags |= MSG_TRUNC;
+ if ((flags & MSG_PEEK) == 0)
+ (void) sbdroprecord_locked(&so->so_rcv);
+ }
+ if ((flags & MSG_PEEK) == 0) {
+ if (m == NULL) {
+ /*
+ * First part is an inline SB_EMPTY_FIXUP(). Second
+ * part makes sure sb_lastrecord is up-to-date if
+ * there is still data in the socket buffer.
+ */
+ so->so_rcv.sb_mb = nextrecord;
+ if (so->so_rcv.sb_mb == NULL) {
+ so->so_rcv.sb_mbtail = NULL;
+ so->so_rcv.sb_lastrecord = NULL;
+ } else if (nextrecord->m_nextpkt == NULL)
+ so->so_rcv.sb_lastrecord = nextrecord;
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ /*
+ * If soreceive() is being done from the socket callback,
+ * then don't need to generate ACK to peer to update window,
+ * since ACK will be generated on return to TCP.
+ */
+ if (!(flags & MSG_SOCALLBCK) &&
+ (pr->pr_flags & PR_WANTRCVD)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (orig_resid == uio->uio_resid && orig_resid &&
+ (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto restart;
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (flagsp != NULL)
+ *flagsp |= flags;
+release:
+ sbunlock(&so->so_rcv);
+ return (error);
+}
+
+/*
+ * Optimized version of soreceive() for stream (TCP) sockets.
+ * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
+ */
+int
+soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ int len = 0, error = 0, flags, oresid;
+ struct sockbuf *sb;
+ struct mbuf *m, *n = NULL;
+
+ /* We only do stream sockets. */
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ return (EINVAL);
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp0 != NULL)
+ *mp0 = NULL;
+
+ sb = &so->so_rcv;
+
+ /* Prevent other readers from entering the socket. */
+ error = sblock(sb, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+ SOCKBUF_LOCK(sb);
+
+ /* Easy one, no space to copyout anything. */
+ if (uio->uio_resid == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ oresid = uio->uio_resid;
+
+ /* We will never ever get anything unless we are or were connected. */
+ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+ error = ENOTCONN;
+ goto out;
+ }
+
+restart:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ /* Abort if socket has reported problems. */
+ if (so->so_error) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ if (oresid > uio->uio_resid)
+ goto out;
+ error = so->so_error;
+ if (!(flags & MSG_PEEK))
+ so->so_error = 0;
+ goto out;
+ }
+
+ /* Door is closed. Deliver what is left, if any. */
+ if (sb->sb_state & SBS_CANTRCVMORE) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ else
+ goto out;
+ }
+
+ /* Socket buffer is empty and we shall not block. */
+ if (sb->sb_cc == 0 &&
+ ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+ error = EAGAIN;
+ goto out;
+ }
+
+ /* Socket buffer got some data that we shall deliver now. */
+ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+ ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+ sb->sb_cc >= sb->sb_lowat ||
+ sb->sb_cc >= uio->uio_resid ||
+ sb->sb_cc >= sb->sb_hiwat)) {
+ goto deliver;
+ }
+
+ /* On MSG_WAITALL we must wait until all data or error arrives. */
+ if ((flags & MSG_WAITALL) &&
+ (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
+ goto deliver;
+
+ /*
+ * Wait and block until (more) data comes in.
+ * NB: Drops the sockbuf lock during wait.
+ */
+ error = sbwait(sb);
+ if (error)
+ goto out;
+ goto restart;
+
+deliver:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
+
+ /* Statistics. */
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+
+ /* Fill uio until full or current end of socket buffer is reached. */
+ len = min(uio->uio_resid, sb->sb_cc);
+ if (mp0 != NULL) {
+ /* Dequeue as many mbufs as possible. */
+ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
+ if (*mp0 == NULL)
+ *mp0 = sb->sb_mb;
+ else
+ m_cat(*mp0, sb->sb_mb);
+ for (m = sb->sb_mb;
+ m != NULL && m->m_len <= len;
+ m = m->m_next) {
+ len -= m->m_len;
+ uio->uio_resid -= m->m_len;
+ sbfree(sb, m);
+ n = m;
+ }
+ n->m_next = NULL;
+ sb->sb_mb = m;
+ sb->sb_lastrecord = sb->sb_mb;
+ if (sb->sb_mb == NULL)
+ SB_EMPTY_FIXUP(sb);
+ }
+ /* Copy the remainder. */
+ if (len > 0) {
+ KASSERT(sb->sb_mb != NULL,
+ ("%s: len > 0 && sb->sb_mb empty", __func__));
+
+ m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
+ if (m == NULL)
+ len = 0; /* Don't flush data from sockbuf. */
+ else
+ uio->uio_resid -= len;
+ if (*mp0 != NULL)
+ m_cat(*mp0, m);
+ else
+ *mp0 = m;
+ if (*mp0 == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ } else {
+ /* NB: Must unlock socket buffer as uiomove may sleep. */
+ SOCKBUF_UNLOCK(sb);
+ error = m_mbuftouio(uio, sb->sb_mb, len);
+ SOCKBUF_LOCK(sb);
+ if (error)
+ goto out;
+ }
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+
+ /*
+ * Remove the delivered data from the socket buffer unless we
+ * were only peeking.
+ */
+ if (!(flags & MSG_PEEK)) {
+ if (len > 0)
+ sbdrop_locked(sb, len);
+
+ /* Notify protocol that we drained some data. */
+ if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
+ (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
+ !(flags & MSG_SOCALLBCK))) {
+ SOCKBUF_UNLOCK(sb);
+ VNET_SO_ASSERT(so);
+ (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(sb);
+ }
+ }
+
+ /*
+ * For MSG_WAITALL we may have to loop again and wait for
+ * more data to come in.
+ */
+ if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+ goto restart;
+out:
+ SOCKBUF_LOCK_ASSERT(sb);
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+ SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
+ return (error);
+}
+
+/*
+ * Optimized version of soreceive() for simple datagram cases from userspace.
+ * Unlike in the stream case, we're able to drop a datagram if copyout()
+ * fails, and because we handle datagrams atomically, we don't need to use a
+ * sleep lock to prevent I/O interlacing.
+ */
+int
+soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct mbuf *m, *m2;
+ int flags, error;
+ ssize_t len;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+
+ /*
+ * For any complicated cases, fall back to the full
+ * soreceive_generic().
+ */
+ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
+ return (soreceive_generic(so, psa, uio, mp0, controlp,
+ flagsp));
+
+ /*
+ * Enforce restrictions on use.
+ */
+ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
+ ("soreceive_dgram: wantrcvd"));
+ KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
+ KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
+ ("soreceive_dgram: SBS_RCVATMARK"));
+ KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
+ ("soreceive_dgram: PR_CONNREQUIRED"));
+
+ /*
+ * Loop blocking while waiting for a datagram.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ while ((m = so->so_rcv.sb_mb) == NULL) {
+ KASSERT(so->so_rcv.sb_cc == 0,
+ ("soreceive_dgram: sb_mb NULL but sb_cc %u",
+ so->so_rcv.sb_cc));
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (error);
+ }
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
+ uio->uio_resid == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (0);
+ }
+ if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (EWOULDBLOCK);
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (error);
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ nextrecord = m->m_nextpkt;
+ if (nextrecord == NULL) {
+ KASSERT(so->so_rcv.sb_lastrecord == m,
+ ("soreceive_dgram: lastrecord != m"));
+ }
+
+ KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
+ ("soreceive_dgram: m_nextpkt != nextrecord"));
+
+ /*
+ * Pull 'm' and its chain off the front of the packet queue.
+ */
+ so->so_rcv.sb_mb = NULL;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+
+ /*
+ * Walk 'm's chain and free that many bytes from the socket buffer.
+ */
+ for (m2 = m; m2 != NULL; m2 = m2->m_next)
+ sbfree(&so->so_rcv, m2);
+
+ /*
+ * Do a few last checks before we let go of the lock.
+ */
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ if (psa != NULL)
+ *psa = sodupsockaddr(mtod(m, struct sockaddr *),
+ M_NOWAIT);
+ m = m_free(m);
+ }
+ if (m == NULL) {
+ /* XXXRW: Can this happen? */
+ return (0);
+ }
+
+ /*
+ * Packet to copyout() is now in 'm' and it is disconnected from the
+ * queue.
+ *
+ * Process one or more MT_CONTROL mbufs present before any data mbufs
+ * in the first mbuf chain on the socket buffer. We call into the
+ * protocol to perform externalization (or freeing if controlp ==
+ * NULL).
+ */
+ if (m->m_type == MT_CONTROL) {
+ struct mbuf *cm = NULL, *cmn;
+ struct mbuf **cme = &cm;
+
+ do {
+ m2 = m->m_next;
+ m->m_next = NULL;
+ *cme = m;
+ cme = &(*cme)->m_next;
+ m = m2;
+ } while (m != NULL && m->m_type == MT_CONTROL);
+ while (cm != NULL) {
+ cmn = cm->m_next;
+ cm->m_next = NULL;
+ if (pr->pr_domain->dom_externalize != NULL) {
+ error = (*pr->pr_domain->dom_externalize)
+ (cm, controlp, flags);
+ } else if (controlp != NULL)
+ *controlp = cm;
+ else
+ m_freem(cm);
+ if (controlp != NULL) {
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ cm = cmn;
+ }
+ }
+ KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
+
+ while (m != NULL && uio->uio_resid > 0) {
+ len = uio->uio_resid;
+ if (len > m->m_len)
+ len = m->m_len;
+ error = uiomove(mtod(m, char *), (int)len, uio);
+ if (error) {
+ m_freem(m);
+ return (error);
+ }
+ if (len == m->m_len)
+ m = m_free(m);
+ else {
+ m->m_data += len;
+ m->m_len -= len;
+ }
+ }
+ if (m != NULL)
+ flags |= MSG_TRUNC;
+ m_freem(m);
+ if (flagsp != NULL)
+ *flagsp |= flags;
+ return (0);
+}
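+
+/*
+ * Illustrative sketch (the structure name and omitted fields are
+ * hypothetical): protocols select one of the soreceive()/sosend()
+ * implementations by wiring it into their pr_usrreqs, e.g.:
+ *
+ *	struct pr_usrreqs foo_usrreqs = {
+ *		...
+ *		.pru_sosend = sosend_dgram,
+ *		.pru_soreceive = soreceive_dgram,
+ *		...
+ *	};
+ */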
+
+int
+soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
+ controlp, flagsp));
+ CURVNET_RESTORE();
+ return (error);
+}
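+
+/*
+ * Example (illustrative sketch; 'buf' and 'buflen' are an assumed
+ * caller-supplied kernel buffer): receiving into kernel memory is done by
+ * building a UIO_SYSSPACE uio:
+ *
+ *	struct uio auio;
+ *	struct iovec aiov;
+ *
+ *	aiov.iov_base = buf;
+ *	aiov.iov_len = buflen;
+ *	auio.uio_iov = &aiov;
+ *	auio.uio_iovcnt = 1;
+ *	auio.uio_offset = 0;
+ *	auio.uio_resid = buflen;
+ *	auio.uio_segflg = UIO_SYSSPACE;
+ *	auio.uio_rw = UIO_READ;
+ *	auio.uio_td = curthread;
+ *	error = soreceive(so, NULL, &auio, NULL, NULL, NULL);
+ */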
+
+int
+soshutdown(struct socket *so, int how)
+{
+ struct protosw *pr = so->so_proto;
+ int error;
+
+ if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
+ return (EINVAL);
+
+ CURVNET_SET(so->so_vnet);
+ if (pr->pr_usrreqs->pru_flush != NULL)
+ (*pr->pr_usrreqs->pru_flush)(so, how);
+ if (how != SHUT_WR)
+ sorflush(so);
+ if (how != SHUT_RD) {
+ error = (*pr->pr_usrreqs->pru_shutdown)(so);
+ wakeup(&so->so_timeo);
+ CURVNET_RESTORE();
+ return (error);
+ }
+ wakeup(&so->so_timeo);
+ CURVNET_RESTORE();
+ return (0);
+}
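+
+/*
+ * Example (illustrative only): a kernel consumer that is done transmitting
+ * but still expects to receive can half-close with:
+ *
+ *	(void)soshutdown(so, SHUT_WR);
+ */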
+
+void
+sorflush(struct socket *so)
+{
+ struct sockbuf *sb = &so->so_rcv;
+ struct protosw *pr = so->so_proto;
+ struct sockbuf asb;
+
+ VNET_SO_ASSERT(so);
+
+ /*
+ * In order to avoid calling dom_dispose with the socket buffer mutex
+ * held, and in order to generally avoid holding the lock for a long
+ * time, we make a copy of the socket buffer and clear the original
+ * (except locks, state). The new socket buffer copy won't have
+ * initialized locks so we can only call routines that won't use or
+ * assert those locks.
+ *
+ * Dislodge threads currently blocked in receive and wait to acquire
+ * a lock against other simultaneous readers before clearing the
+ * socket buffer. Don't let our acquire be interrupted by a signal
+ * despite any existing socket disposition on interruptible waiting.
+ */
+ socantrcvmore(so);
+ (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
+
+ /*
+ * Invalidate/clear most of the sockbuf structure, but leave selinfo
+ * and mutex data unchanged.
+ */
+ SOCKBUF_LOCK(sb);
+ bzero(&asb, offsetof(struct sockbuf, sb_startzero));
+ bcopy(&sb->sb_startzero, &asb.sb_startzero,
+ sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
+ bzero(&sb->sb_startzero,
+ sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
+ SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
+
+ /*
+ * Dispose of special rights and flush the socket buffer. Don't call
+ * any unsafe routines (that rely on locks being initialized) on asb.
+ */
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+ (*pr->pr_domain->dom_dispose)(asb.sb_mb);
+ sbrelease_internal(&asb, so);
+}
+
+/*
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in an
+ * additional variant to handle the case where the option value needs to be
+ * some kind of integer, but not a specific size. In addition to their use
+ * here, these functions are also called by the protocol-level pr_ctloutput()
+ * routines.
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+ size_t valsize;
+
+ /*
+ * If the user gives us more than we wanted, we ignore it, but if we
+ * don't get the minimum length the caller wants, we return EINVAL.
+ * On success, sopt->sopt_valsize is set to however much we actually
+ * retrieved.
+ */
+ if ((valsize = sopt->sopt_valsize) < minlen)
+ return EINVAL;
+ if (valsize > len)
+ sopt->sopt_valsize = valsize = len;
+
+ if (sopt->sopt_td != NULL)
+ return (copyin(sopt->sopt_val, buf, valsize));
+
+ bcopy(sopt->sopt_val, buf, valsize);
+ return (0);
+}
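+
+/*
+ * Example: a minimal sketch of how a protocol-level pr_ctloutput() routine
+ * might use sooptcopyin() on the SOPT_SET side for a fixed-size integer
+ * option; the validation shown is a placeholder and is protocol-specific:
+ *
+ *	int error, optval;
+ *
+ *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ *	if (error)
+ *		return (error);
+ *	if (optval < 0)
+ *		return (EINVAL);
+ *	(apply optval to protocol state)
+ */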
+
+/*
+ * Kernel version of setsockopt(2).
+ *
+ * XXX: optlen is size_t, not socklen_t
+ */
+int
+so_setsockopt(struct socket *so, int level, int optname, void *optval,
+ size_t optlen)
+{
+ struct sockopt sopt;
+
+ sopt.sopt_level = level;
+ sopt.sopt_name = optname;
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_val = optval;
+ sopt.sopt_valsize = optlen;
+ sopt.sopt_td = NULL;
+ return (sosetopt(so, &sopt));
+}
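+
+/*
+ * Example: a minimal sketch of an in-kernel caller; `so' is assumed to be a
+ * socket already created and owned by the caller, and `error' an int:
+ *
+ *	int one = 1;
+ *
+ *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
+ */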
+
+int
+sosetopt(struct socket *so, struct sockopt *sopt)
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+ sbintime_t val;
+ uint32_t val32;
+#ifdef MAC
+ struct mac extmac;
+#endif
+
+ CURVNET_SET(so->so_vnet);
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto->pr_ctloutput != NULL) {
+ error = (*so->so_proto->pr_ctloutput)(so, sopt);
+ CURVNET_RESTORE();
+ return (error);
+ }
+ error = ENOPROTOOPT;
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ error = do_setopt_accept_filter(so, sopt);
+ if (error)
+ goto bad;
+ break;
+#endif
+ case SO_LINGER:
+ error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
+ if (error)
+ goto bad;
+
+ SOCK_LOCK(so);
+ so->so_linger = l.l_linger;
+ if (l.l_onoff)
+ so->so_options |= SO_LINGER;
+ else
+ so->so_options &= ~SO_LINGER;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_DONTROUTE:
+ case SO_USELOOPBACK:
+ case SO_BROADCAST:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ case SO_BINTIME:
+ case SO_NOSIGPIPE:
+ case SO_NO_DDP:
+ case SO_NO_OFFLOAD:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+ SOCK_LOCK(so);
+ if (optval)
+ so->so_options |= sopt->sopt_name;
+ else
+ so->so_options &= ~sopt->sopt_name;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SO_SETFIB:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ if (optval < 0 || optval >= rt_numfibs) {
+ error = EINVAL;
+ goto bad;
+ }
+ if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
+ (so->so_proto->pr_domain->dom_family == PF_INET6) ||
+ (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
+ so->so_fibnum = optval;
+ else
+ so->so_fibnum = 0;
+ break;
+
+ case SO_USER_COOKIE:
+ error = sooptcopyin(sopt, &val32, sizeof val32,
+ sizeof val32);
+ if (error)
+ goto bad;
+ so->so_user_cookie = val32;
+ break;
+
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ /*
+ * Values < 1 make no sense for any of these options,
+ * so disallow them.
+ */
+ if (optval < 1) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ switch (sopt->sopt_name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
+ &so->so_snd : &so->so_rcv, (u_long)optval,
+ so, curthread) == 0) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
+ &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
+ break;
+
+ /*
+ * Make sure the low-water is never greater than the
+ * high-water.
+ */
+ case SO_SNDLOWAT:
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_lowat =
+ (optval > so->so_snd.sb_hiwat) ?
+ so->so_snd.sb_hiwat : optval;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ break;
+ case SO_RCVLOWAT:
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_lowat =
+ (optval > so->so_rcv.sb_hiwat) ?
+ so->so_rcv.sb_hiwat : optval;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ break;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ struct timeval32 tv32;
+
+ error = sooptcopyin(sopt, &tv32, sizeof tv32,
+ sizeof tv32);
+ CP(tv32, tv, tv_sec);
+ CP(tv32, tv, tv_usec);
+ } else
+#endif
+ error = sooptcopyin(sopt, &tv, sizeof tv,
+ sizeof tv);
+ if (error)
+ goto bad;
+ if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
+ tv.tv_usec >= 1000000) {
+ error = EDOM;
+ goto bad;
+ }
+ val = tvtosbt(tv);
+
+ switch (sopt->sopt_name) {
+ case SO_SNDTIMEO:
+ so->so_snd.sb_timeo = val;
+ break;
+ case SO_RCVTIMEO:
+ so->so_rcv.sb_timeo = val;
+ break;
+ }
+ break;
+
+ case SO_LABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof extmac,
+ sizeof extmac);
+ if (error)
+ goto bad;
+ error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
+ so, &extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error == 0 && so->so_proto->pr_ctloutput != NULL)
+ (void)(*so->so_proto->pr_ctloutput)(so, sopt);
+ }
+bad:
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * Helper routine for getsockopt.
+ */
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+ int error;
+ size_t valsize;
+
+ error = 0;
+
+ /*
+ * Documented get behavior is that we always return a value, possibly
+ * truncated to fit in the user's buffer. Traditional behavior is
+ * that we always tell the user precisely how much we copied, rather
+ * than something useful like the total amount we had available for
+ * her. Note that this interface is not idempotent; the entire
+ * answer must be generated ahead of time.
+ */
+ valsize = min(len, sopt->sopt_valsize);
+ sopt->sopt_valsize = valsize;
+ if (sopt->sopt_val != NULL) {
+ if (sopt->sopt_td != NULL)
+ error = copyout(buf, sopt->sopt_val, valsize);
+ else
+ bcopy(buf, sopt->sopt_val, valsize);
+ }
+ return (error);
+}
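+
+/*
+ * Example: a minimal sketch of the SOPT_GET side of a protocol-level
+ * pr_ctloutput() routine returning a fixed-size integer option; where the
+ * value comes from is protocol-specific:
+ *
+ *	int optval;
+ *
+ *	optval = (value read from protocol state);
+ *	return (sooptcopyout(sopt, &optval, sizeof(optval)));
+ */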
+
+int
+sogetopt(struct socket *so, struct sockopt *sopt)
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+#ifdef MAC
+ struct mac extmac;
+#endif
+
+ CURVNET_SET(so->so_vnet);
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto->pr_ctloutput != NULL)
+ error = (*so->so_proto->pr_ctloutput)(so, sopt);
+ else
+ error = ENOPROTOOPT;
+ CURVNET_RESTORE();
+ return (error);
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ error = do_getopt_accept_filter(so, sopt);
+ break;
+#endif
+ case SO_LINGER:
+ SOCK_LOCK(so);
+ l.l_onoff = so->so_options & SO_LINGER;
+ l.l_linger = so->so_linger;
+ SOCK_UNLOCK(so);
+ error = sooptcopyout(sopt, &l, sizeof l);
+ break;
+
+ case SO_USELOOPBACK:
+ case SO_DONTROUTE:
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_BROADCAST:
+ case SO_OOBINLINE:
+ case SO_ACCEPTCONN:
+ case SO_TIMESTAMP:
+ case SO_BINTIME:
+ case SO_NOSIGPIPE:
+ optval = so->so_options & sopt->sopt_name;
+integer:
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case SO_TYPE:
+ optval = so->so_type;
+ goto integer;
+
+ case SO_PROTOCOL:
+ optval = so->so_proto->pr_protocol;
+ goto integer;
+
+ case SO_ERROR:
+ SOCK_LOCK(so);
+ optval = so->so_error;
+ so->so_error = 0;
+ SOCK_UNLOCK(so);
+ goto integer;
+
+ case SO_SNDBUF:
+ optval = so->so_snd.sb_hiwat;
+ goto integer;
+
+ case SO_RCVBUF:
+ optval = so->so_rcv.sb_hiwat;
+ goto integer;
+
+ case SO_SNDLOWAT:
+ optval = so->so_snd.sb_lowat;
+ goto integer;
+
+ case SO_RCVLOWAT:
+ optval = so->so_rcv.sb_lowat;
+ goto integer;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
+     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ struct timeval32 tv32;
+
+ CP(tv, tv32, tv_sec);
+ CP(tv, tv32, tv_usec);
+ error = sooptcopyout(sopt, &tv32, sizeof tv32);
+ } else
+#endif
+ error = sooptcopyout(sopt, &tv, sizeof tv);
+ break;
+
+ case SO_LABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof(extmac),
+ sizeof(extmac));
+ if (error)
+ goto bad;
+ error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
+ so, &extmac);
+ if (error)
+ goto bad;
+ error = sooptcopyout(sopt, &extmac, sizeof extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ case SO_PEERLABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof(extmac),
+ sizeof(extmac));
+ if (error)
+ goto bad;
+ error = mac_getsockopt_peerlabel(
+ sopt->sopt_td->td_ucred, so, &extmac);
+ if (error)
+ goto bad;
+ error = sooptcopyout(sopt, &extmac, sizeof extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ case SO_LISTENQLIMIT:
+ optval = so->so_qlimit;
+ goto integer;
+
+ case SO_LISTENQLEN:
+ optval = so->so_qlen;
+ goto integer;
+
+ case SO_LISTENINCQLEN:
+ optval = so->so_incqlen;
+ goto integer;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ }
+#ifdef MAC
+bad:
+#endif
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+soopt_getm(struct sockopt *sopt, struct mbuf **mp)
+{
+ struct mbuf *m, *m_prev;
+ int sopt_size = sopt->sopt_valsize;
+
+ MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ return ENOBUFS;
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ *mp = m;
+ m_prev = m;
+
+ while (sopt_size) {
+ MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
+ M_NOWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_freem(m);
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ m_prev->m_next = m;
+ m_prev = m;
+ }
+ return (0);
+}
+
+int
+soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+
+ if (sopt->sopt_val == NULL)
+ return (0);
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyin(sopt->sopt_val, mtod(m, char *),
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) /* enough space should have been allocated at ip6_sooptmcopyin() */
+ panic("ip6_sooptmcopyin");
+ return (0);
+}
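+
+/*
+ * Example: a minimal sketch of how protocol code that wants the option value
+ * in an mbuf chain pairs soopt_getm() with soopt_mcopyin(); note that
+ * soopt_mcopyin() frees the chain itself on a copyin failure:
+ *
+ *	struct mbuf *m;
+ *	int error;
+ *
+ *	error = soopt_getm(sopt, &m);
+ *	if (error == 0)
+ *		error = soopt_mcopyin(sopt, m);
+ *	if (error == 0) {
+ *		(parse the chain, then m_freem() it when done)
+ *	}
+ */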
+
+int
+soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+ size_t valsize = 0;
+
+ if (sopt->sopt_val == NULL)
+ return (0);
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyout(mtod(m, char *), sopt->sopt_val,
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
+ valsize += m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) {
+ /* a sufficiently large soopt buffer should have been provided from userland */
+ m_freem(m0);
+ return(EINVAL);
+ }
+ sopt->sopt_valsize = valsize;
+ return (0);
+}
+
+/*
+ * sohasoutofband(): protocol notifies socket layer of the arrival of new
+ * out-of-band data, which will then notify socket consumers.
+ */
+void
+sohasoutofband(struct socket *so)
+{
+
+ if (so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGURG, 0);
+ selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
+}
+
+int
+sopoll(struct socket *so, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ /*
+ * We do not need to set or assert curvnet as long as everyone uses
+ * sopoll_generic().
+ */
+ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
+ td));
+}
+
+int
+sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ int revents = 0;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (events & (POLLIN | POLLRDNORM))
+ if (soreadabledata(so))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (sowriteable(so))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & (POLLPRI | POLLRDBAND))
+ if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
+ revents |= events & (POLLPRI | POLLRDBAND);
+
+ if ((events & POLLINIGNEOF) == 0) {
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE)
+ revents |= POLLHUP;
+ }
+ }
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
+ selrecord(td, &so->so_rcv.sb_sel);
+ so->so_rcv.sb_flags |= SB_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &so->so_snd.sb_sel);
+ so->so_snd.sb_flags |= SB_SEL;
+ }
+ }
+
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (revents);
+}
+
+int
+soo_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+ struct sockbuf *sb;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ if (so->so_options & SO_ACCEPTCONN)
+ kn->kn_fop = &solisten_filtops;
+ else
+ kn->kn_fop = &soread_filtops;
+ sb = &so->so_rcv;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &sowrite_filtops;
+ sb = &so->so_snd;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ SOCKBUF_LOCK(sb);
+ knlist_add(&sb->sb_sel.si_note, kn, 1);
+ sb->sb_flags |= SB_KNOTE;
+ SOCKBUF_UNLOCK(sb);
+ return (0);
+}
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_disconnect_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *addr, struct mbuf *control, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one and
+ * doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+int
+pru_shutdown_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
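+
+/*
+ * Example: a minimal sketch of how a protocol plugs the stubs above into its
+ * pr_usrreqs for entry points it does not implement; the structure name and
+ * the selection of members are illustrative only:
+ *
+ *	struct pr_usrreqs example_usrreqs = {
+ *		.pru_accept =	pru_accept_notsupp,
+ *		.pru_connect2 =	pru_connect2_notsupp,
+ *		.pru_rcvoob =	pru_rcvoob_notsupp,
+ *		.pru_sense =	pru_sense_null,
+ *	};
+ */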
+
+static void
+filt_sordetach(struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_rcv.sb_sel.si_note))
+ so->so_rcv.sb_flags &= ~SB_KNOTE;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
+/*ARGSUSED*/
+static int
+filt_soread(struct knote *kn, long hint)
+{
+ struct socket *so;
+
+ so = kn->kn_fp->f_data;
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ } else if (so->so_error) /* temporary udp error */
+ return (1);
+ else if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ else
+ return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
+}
+
+static void
+filt_sowdetach(struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_snd.sb_sel.si_note))
+ so->so_snd.sb_flags &= ~SB_KNOTE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+}
+
+/*ARGSUSED*/
+static int
+filt_sowrite(struct knote *kn, long hint)
+{
+ struct socket *so;
+
+ so = kn->kn_fp->f_data;
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+ kn->kn_data = sbspace(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ } else if (so->so_error) /* temporary udp error */
+ return (1);
+ else if (((so->so_state & SS_ISCONNECTED) == 0) &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED))
+ return (0);
+ else if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ else
+ return (kn->kn_data >= so->so_snd.sb_lowat);
+}
+
+/*ARGSUSED*/
+static int
+filt_solisten(struct knote *kn, long hint)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ kn->kn_data = so->so_qlen;
+ return (!TAILQ_EMPTY(&so->so_comp));
+}
+
+int
+socheckuid(struct socket *so, uid_t uid)
+{
+
+ if (so == NULL)
+ return (EPERM);
+ if (so->so_cred->cr_uid != uid)
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * These functions are used by protocols to notify the socket layer (and its
+ * consumers) of state changes in the sockets driven by protocol-side events.
+ */
+
+/*
+ * Procedures to manipulate state flags of socket and do appropriate wakeups.
+ *
+ * Normal sequence from the active (originating) side is that
+ * soisconnecting() is called during processing of connect() call, resulting
+ * in an eventual call to soisconnected() if/when the connection is
+ * established. When the connection is torn down soisdisconnecting() is
+ * called during processing of disconnect() call, and soisdisconnected() is
+ * called when the connection to the peer is totally severed. The semantics
+ * of these routines are such that connectionless protocols can call
+ * soisconnected() and soisdisconnected() only, bypassing the in-progress
+ * calls when setting up a ``connection'' takes no time.
+ *
+ * From the passive side, a socket is created with two queues of sockets:
+ * so_incomp for connections in progress and so_comp for connections already
+ * made and awaiting user acceptance. As a protocol is preparing incoming
+ * connections, it creates a socket structure queued on so_incomp by calling
+ * sonewconn(). When the connection is established, soisconnected() is
+ * called, and transfers the socket structure to so_comp, making it available
+ * to accept().
+ *
+ * If a socket is closed with sockets on either so_incomp or so_comp, these
+ * sockets are dropped.
+ *
+ * If higher-level protocols are implemented in the kernel, the wakeups done
+ * here will sometimes cause software-interrupt process scheduling.
+ */
+void
+soisconnecting(struct socket *so)
+{
+
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+ SOCK_UNLOCK(so);
+}
+
+void
+soisconnected(struct socket *so)
+{
+ struct socket *head;
+ int ret;
+
+restart:
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ head = so->so_head;
+ if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+ if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ SOCK_UNLOCK(so);
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_qstate &= ~SQ_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ so->so_qstate |= SQ_COMP;
+ ACCEPT_UNLOCK();
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ ACCEPT_UNLOCK();
+ soupcall_set(so, SO_RCV,
+ head->so_accf->so_accept_filter->accf_callback,
+ head->so_accf->so_accept_filter_arg);
+ so->so_options &= ~SO_ACCEPTFILTER;
+ ret = head->so_accf->so_accept_filter->accf_callback(so,
+ head->so_accf->so_accept_filter_arg, M_NOWAIT);
+ if (ret == SU_ISCONNECTED)
+ soupcall_clear(so, SO_RCV);
+ SOCK_UNLOCK(so);
+ if (ret == SU_ISCONNECTED)
+ goto restart;
+ }
+ return;
+ }
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+}
+
+void
+soisdisconnecting(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= SS_ISDISCONNECTING;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
+
+void
+soisdisconnected(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISDISCONNECTED;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
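+
+/*
+ * Example: a minimal sketch of the active-side call sequence a
+ * connection-oriented protocol drives through the routines above; the events
+ * in parentheses are protocol-specific:
+ *
+ *	soisconnecting(so);	(connection request sent)
+ *	soisconnected(so);	(handshake completed)
+ *	soisdisconnecting(so);	(local close initiated)
+ *	soisdisconnected(so);	(connection fully torn down)
+ */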
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+sodupsockaddr(const struct sockaddr *sa, int mflags)
+{
+ struct sockaddr *sa2;
+
+ sa2 = malloc(sa->sa_len, M_SONAME, mflags);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Register per-socket buffer upcalls.
+ */
+void
+soupcall_set(struct socket *so, int which,
+ int (*func)(struct socket *, void *, int), void *arg)
+{
+ struct sockbuf *sb;
+
+ switch (which) {
+ case SO_RCV:
+ sb = &so->so_rcv;
+ break;
+ case SO_SND:
+ sb = &so->so_snd;
+ break;
+ default:
+ panic("soupcall_set: bad which");
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+#if 0
+ /* XXX: accf_http actually wants to do this on purpose. */
+ KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
+#endif
+ sb->sb_upcall = func;
+ sb->sb_upcallarg = arg;
+ sb->sb_flags |= SB_UPCALL;
+}
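+
+/*
+ * Example: a minimal sketch of an in-kernel consumer registering a receive
+ * upcall; example_upcall() and `arg' are hypothetical, and the sockbuf lock
+ * must be held across soupcall_set() as asserted above:
+ *
+ *	static int
+ *	example_upcall(struct socket *so, void *arg, int waitflag)
+ *	{
+ *		(note that data is available; do not sleep here)
+ *		return (SU_OK);
+ *	}
+ *
+ *	SOCKBUF_LOCK(&so->so_rcv);
+ *	soupcall_set(so, SO_RCV, example_upcall, arg);
+ *	SOCKBUF_UNLOCK(&so->so_rcv);
+ */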
+
+void
+soupcall_clear(struct socket *so, int which)
+{
+ struct sockbuf *sb;
+
+ switch (which) {
+ case SO_RCV:
+ sb = &so->so_rcv;
+ break;
+ case SO_SND:
+ sb = &so->so_snd;
+ break;
+ default:
+ panic("soupcall_clear: bad which");
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+ KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
+ sb->sb_upcall = NULL;
+ sb->sb_upcallarg = NULL;
+ sb->sb_flags &= ~SB_UPCALL;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information in
+ * the kernel-format socket structure pointed to by so. This is done to
+ * reduce the spew of irrelevant information over this interface, to isolate
+ * user code from changes in the kernel structure, and potentially to provide
+ * information-hiding if we decide that some of this information should be
+ * hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_cred->cr_uid;
+}
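+
+/*
+ * Example: a minimal sketch of a sysctl-style exporter; `so' and the sysctl
+ * request handle `req' are assumed to be supplied by the caller:
+ *
+ *	struct xsocket xso;
+ *	int error;
+ *
+ *	sotoxsocket(so, &xso);
+ *	error = SYSCTL_OUT(req, &xso, sizeof(xso));
+ */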
+
+/*
+ * Socket accessor functions to provide external consumers with a safe
+ * interface to socket state.
+ */
+
+void
+so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
+ void *arg)
+{
+
+ TAILQ_FOREACH(so, &so->so_comp, so_list)
+ func(so, arg);
+}
+
+struct sockbuf *
+so_sockbuf_rcv(struct socket *so)
+{
+
+ return (&so->so_rcv);
+}
+
+struct sockbuf *
+so_sockbuf_snd(struct socket *so)
+{
+
+ return (&so->so_snd);
+}
+
+int
+so_state_get(const struct socket *so)
+{
+
+ return (so->so_state);
+}
+
+void
+so_state_set(struct socket *so, int val)
+{
+
+ so->so_state = val;
+}
+
+int
+so_options_get(const struct socket *so)
+{
+
+ return (so->so_options);
+}
+
+void
+so_options_set(struct socket *so, int val)
+{
+
+ so->so_options = val;
+}
+
+int
+so_error_get(const struct socket *so)
+{
+
+ return (so->so_error);
+}
+
+void
+so_error_set(struct socket *so, int val)
+{
+
+ so->so_error = val;
+}
+
+int
+so_linger_get(const struct socket *so)
+{
+
+ return (so->so_linger);
+}
+
+void
+so_linger_set(struct socket *so, int val)
+{
+
+ so->so_linger = val;
+}
+
+struct protosw *
+so_protosw_get(const struct socket *so)
+{
+
+ return (so->so_proto);
+}
+
+void
+so_protosw_set(struct socket *so, struct protosw *val)
+{
+
+ so->so_proto = val;
+}
+
+void
+so_sorwakeup(struct socket *so)
+{
+
+ sorwakeup(so);
+}
+
+void
+so_sowwakeup(struct socket *so)
+{
+
+ sowwakeup(so);
+}
+
+void
+so_sorwakeup_locked(struct socket *so)
+{
+
+ sorwakeup_locked(so);
+}
+
+void
+so_sowwakeup_locked(struct socket *so)
+{
+
+ sowwakeup_locked(so);
+}
+
+void
+so_lock(struct socket *so)
+{
+
+ SOCK_LOCK(so);
+}
+
+void
+so_unlock(struct socket *so)
+{
+
+ SOCK_UNLOCK(so);
+}
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
new file mode 100644
index 0000000..8229390
--- /dev/null
+++ b/sys/kern/uipc_syscalls.c
@@ -0,0 +1,2935 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * sendfile(2) and related extensions:
+ * Copyright (c) 1998, David Greenman. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_sctp.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/jail.h>
+#include <sys/mount.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/rwlock.h>
+#include <sys/sf_buf.h>
+#include <sys/sysent.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32_util.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+#if defined(INET) || defined(INET6)
+#ifdef SCTP
+#include <netinet/sctp.h>
+#include <netinet/sctp_peeloff.h>
+#endif /* SCTP */
+#endif /* INET || INET6 */
+
+/*
+ * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
+ * and SOCK_NONBLOCK.
+ */
+#define ACCEPT4_INHERIT 0x1
+#define ACCEPT4_COMPAT 0x2
+
+static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
+static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
+
+static int accept1(struct thread *td, int s, struct sockaddr *uname,
+ socklen_t *anamelen, int flags);
+static int do_sendfile(struct thread *td, struct sendfile_args *uap,
+ int compat);
+static int getsockname1(struct thread *td, struct getsockname_args *uap,
+ int compat);
+static int getpeername1(struct thread *td, struct getpeername_args *uap,
+ int compat);
+
+counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
+
+/*
+ * sendfile(2)-related variables and associated sysctls
+ */
+int nsfbufs;
+int nsfbufspeak;
+int nsfbufsused;
+static int sfreadahead = 1;
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+ "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+ "Number of sendfile(2) sf_bufs in use");
+SYSCTL_INT(_kern_ipc, OID_AUTO, sfreadahead, CTLFLAG_RW, &sfreadahead, 0,
+ "Number of sendfile(2) read-ahead MAXBSIZE blocks");
+
+
+static void
+sfstat_init(const void *unused)
+{
+
+ COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
+ M_WAITOK);
+}
+SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
+
+static int
+sfstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct sfstat s;
+
+ COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
+ if (req->newptr)
+ COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
+ return (SYSCTL_OUT(req, &s, sizeof(s)));
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
+ NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
+
+/*
+ * Convert a user file descriptor to a kernel file entry and check if required
+ * capability rights are present.
+ * A reference on the file entry is held upon returning.
+ */
+static int
+getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
+ struct file **fpp, u_int *fflagp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ fdrop(fp, curthread);
+ return (ENOTSOCK);
+ }
+ if (fflagp != NULL)
+ *fflagp = fp->f_flag;
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * System call interface to the socket abstraction.
+ */
+#if defined(COMPAT_43)
+#define COMPAT_OLDSOCK
+#endif
+
+int
+sys_socket(td, uap)
+ struct thread *td;
+ struct socket_args /* {
+ int domain;
+ int type;
+ int protocol;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ int fd, error, type, oflag, fflag;
+
+ AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
+
+ type = uap->type;
+ oflag = 0;
+ fflag = 0;
+ if ((type & SOCK_CLOEXEC) != 0) {
+ type &= ~SOCK_CLOEXEC;
+ oflag |= O_CLOEXEC;
+ }
+ if ((type & SOCK_NONBLOCK) != 0) {
+ type &= ~SOCK_NONBLOCK;
+ fflag |= FNONBLOCK;
+ }
+
+#ifdef MAC
+ error = mac_socket_check_create(td->td_ucred, uap->domain, type,
+ uap->protocol);
+ if (error != 0)
+ return (error);
+#endif
+ error = falloc(td, &fp, &fd, oflag);
+ if (error != 0)
+ return (error);
+ /* An extra reference on `fp' has been held for us by falloc(). */
+ error = socreate(uap->domain, &so, type, uap->protocol,
+ td->td_ucred, td);
+ if (error != 0) {
+ fdclose(td->td_proc->p_fd, fp, fd, td);
+ } else {
+ finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
+ if ((fflag & FNONBLOCK) != 0)
+ (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
+ td->td_retval[0] = fd;
+ }
+ fdrop(fp, td);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_bind(td, uap)
+ struct thread *td;
+ struct bind_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_bind(td, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+static int
+kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_BIND), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+#ifdef MAC
+ error = mac_socket_check_bind(td->td_ucred, so, sa);
+ if (error == 0) {
+#endif
+ if (dirfd == AT_FDCWD)
+ error = sobind(so, sa, td);
+ else
+ error = sobindat(dirfd, so, sa, td);
+#ifdef MAC
+ }
+#endif
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kern_bind(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+ return (kern_bindat(td, AT_FDCWD, fd, sa));
+}
+
+/* ARGSUSED */
+int
+sys_bindat(td, uap)
+ struct thread *td;
+ struct bindat_args /* {
+ int fd;
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_bindat(td, uap->fd, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_listen(td, uap)
+ struct thread *td;
+ struct listen_args /* {
+ int s;
+ int backlog;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->s);
+ error = getsock_cap(td->td_proc->p_fd, uap->s,
+ cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+#ifdef MAC
+ error = mac_socket_check_listen(td->td_ucred, so);
+ if (error == 0)
+#endif
+ error = solisten(so, uap->backlog, td);
+ fdrop(fp, td);
+ }
+ return(error);
+}
+
+/*
+ * accept1()
+ */
+static int
+accept1(td, s, uname, anamelen, flags)
+ struct thread *td;
+ int s;
+ struct sockaddr *uname;
+ socklen_t *anamelen;
+ int flags;
+{
+ struct sockaddr *name;
+ socklen_t namelen;
+ struct file *fp;
+ int error;
+
+ if (uname == NULL)
+ return (kern_accept4(td, s, NULL, NULL, flags, NULL));
+
+ error = copyin(anamelen, &namelen, sizeof (namelen));
+ if (error != 0)
+ return (error);
+
+ error = kern_accept4(td, s, &name, &namelen, flags, &fp);
+
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (error != 0) {
+ (void) copyout(&namelen, anamelen, sizeof(*anamelen));
+ return (error);
+ }
+
+ if (error == 0 && uname != NULL) {
+#ifdef COMPAT_OLDSOCK
+ if (flags & ACCEPT4_COMPAT)
+ ((struct osockaddr *)name)->sa_family =
+ name->sa_family;
+#endif
+ error = copyout(name, uname, namelen);
+ }
+ if (error == 0)
+ error = copyout(&namelen, anamelen,
+ sizeof(namelen));
+ if (error != 0)
+ fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
+ fdrop(fp, td);
+ free(name, M_SONAME);
+ return (error);
+}
+
+int
+kern_accept(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, struct file **fp)
+{
+ return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
+}
+
+int
+kern_accept4(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, int flags, struct file **fp)
+{
+ struct filedesc *fdp;
+ struct file *headfp, *nfp = NULL;
+ struct sockaddr *sa = NULL;
+ struct socket *head, *so;
+ cap_rights_t rights;
+ u_int fflag;
+ pid_t pgid;
+ int error, fd, tmp;
+
+ if (name != NULL)
+ *name = NULL;
+
+ AUDIT_ARG_FD(s);
+ fdp = td->td_proc->p_fd;
+ error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
+ &headfp, &fflag);
+ if (error != 0)
+ return (error);
+ head = headfp->f_data;
+ if ((head->so_options & SO_ACCEPTCONN) == 0) {
+ error = EINVAL;
+ goto done;
+ }
+#ifdef MAC
+ error = mac_socket_check_accept(td->td_ucred, head);
+ if (error != 0)
+ goto done;
+#endif
+ error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
+ if (error != 0)
+ goto done;
+ ACCEPT_LOCK();
+ if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
+ ACCEPT_UNLOCK();
+ error = EWOULDBLOCK;
+ goto noconnection;
+ }
+ while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
+ if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ head->so_error = ECONNABORTED;
+ break;
+ }
+ error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
+ "accept", 0);
+ if (error != 0) {
+ ACCEPT_UNLOCK();
+ goto noconnection;
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ ACCEPT_UNLOCK();
+ goto noconnection;
+ }
+ so = TAILQ_FIRST(&head->so_comp);
+ KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
+ KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
+
+ /*
+ * Before changing the flags on the socket, we have to bump the
+ * reference count. Otherwise, if the protocol calls sofree(),
+ * the socket will be released due to a zero refcount.
+ */
+ SOCK_LOCK(so); /* soref() and so_state update */
+ soref(so); /* file descriptor reference */
+
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+ if (flags & ACCEPT4_INHERIT)
+ so->so_state |= (head->so_state & SS_NBIO);
+ else
+ so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+ so->so_qstate &= ~SQ_COMP;
+ so->so_head = NULL;
+
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+
+ /* An extra reference on `nfp' has been held for us by falloc(). */
+ td->td_retval[0] = fd;
+
+ /* connection has been removed from the listen queue */
+ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
+
+ if (flags & ACCEPT4_INHERIT) {
+ pgid = fgetown(&head->so_sigio);
+ if (pgid != 0)
+ fsetown(pgid, &so->so_sigio);
+ } else {
+ fflag &= ~(FNONBLOCK | FASYNC);
+ if (flags & SOCK_NONBLOCK)
+ fflag |= FNONBLOCK;
+ }
+
+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
+ /* Sync socket nonblocking/async state with file flags */
+ tmp = fflag & FNONBLOCK;
+ (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
+ tmp = fflag & FASYNC;
+ (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
+ sa = NULL;
+ error = soaccept(so, &sa);
+ if (error != 0) {
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (name)
+ *namelen = 0;
+ goto noconnection;
+ }
+ if (sa == NULL) {
+ if (name)
+ *namelen = 0;
+ goto done;
+ }
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
+ if (name) {
+ /* check sa_len before it is destroyed */
+ if (*namelen > sa->sa_len)
+ *namelen = sa->sa_len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+ *name = sa;
+ sa = NULL;
+ }
+noconnection:
+ free(sa, M_SONAME);
+
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error != 0)
+ fdclose(fdp, nfp, fd, td);
+
+ /*
+ * Release explicitly held references before returning. We return
+ * a reference on nfp to the caller on success if they request it.
+ */
+done:
+ if (fp != NULL) {
+ if (error == 0) {
+ *fp = nfp;
+ nfp = NULL;
+ } else
+ *fp = NULL;
+ }
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fdrop(headfp, td);
+ return (error);
+}
+
+int
+sys_accept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
+}
+
+int
+sys_accept4(td, uap)
+ struct thread *td;
+ struct accept4_args *uap;
+{
+
+ if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return (EINVAL);
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+oaccept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen,
+ ACCEPT4_INHERIT | ACCEPT4_COMPAT));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/* ARGSUSED */
+int
+sys_connect(td, uap)
+ struct thread *td;
+ struct connect_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_connect(td, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+static int
+kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error, interrupted = 0;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ if (so->so_state & SS_ISCONNECTING) {
+ error = EALREADY;
+ goto done1;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+#ifdef MAC
+ error = mac_socket_check_connect(td->td_ucred, so, sa);
+ if (error != 0)
+ goto bad;
+#endif
+ if (dirfd == AT_FDCWD)
+ error = soconnect(so, sa, td);
+ else
+ error = soconnectat(dirfd, so, sa, td);
+ if (error != 0)
+ goto bad;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
+ error = EINPROGRESS;
+ goto done1;
+ }
+ SOCK_LOCK(so);
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
+ "connec", 0);
+ if (error != 0) {
+ if (error == EINTR || error == ERESTART)
+ interrupted = 1;
+ break;
+ }
+ }
+ if (error == 0) {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ SOCK_UNLOCK(so);
+bad:
+ if (!interrupted)
+ so->so_state &= ~SS_ISCONNECTING;
+ if (error == ERESTART)
+ error = EINTR;
+done1:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kern_connect(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+ return (kern_connectat(td, AT_FDCWD, fd, sa));
+}
+
+/* ARGSUSED */
+int
+sys_connectat(td, uap)
+ struct thread *td;
+ struct connectat_args /* {
+ int fd;
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_connectat(td, uap->fd, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+int
+kern_socketpair(struct thread *td, int domain, int type, int protocol,
+ int *rsv)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *fp1, *fp2;
+ struct socket *so1, *so2;
+ int fd, error, oflag, fflag;
+
+ AUDIT_ARG_SOCKET(domain, type, protocol);
+
+ oflag = 0;
+ fflag = 0;
+ if ((type & SOCK_CLOEXEC) != 0) {
+ type &= ~SOCK_CLOEXEC;
+ oflag |= O_CLOEXEC;
+ }
+ if ((type & SOCK_NONBLOCK) != 0) {
+ type &= ~SOCK_NONBLOCK;
+ fflag |= FNONBLOCK;
+ }
+#ifdef MAC
+ /* We might want to have a separate check for socket pairs. */
+ error = mac_socket_check_create(td->td_ucred, domain, type,
+ protocol);
+ if (error != 0)
+ return (error);
+#endif
+ error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
+ if (error != 0)
+ return (error);
+ error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
+ if (error != 0)
+ goto free1;
+ /* On success, extra references on `fp1' and `fp2' are held for us by falloc(). */
+ error = falloc(td, &fp1, &fd, oflag);
+ if (error != 0)
+ goto free2;
+ rsv[0] = fd;
+ fp1->f_data = so1; /* so1 already has ref count */
+ error = falloc(td, &fp2, &fd, oflag);
+ if (error != 0)
+ goto free3;
+ fp2->f_data = so2; /* so2 already has ref count */
+ rsv[1] = fd;
+ error = soconnect2(so1, so2);
+ if (error != 0)
+ goto free4;
+ if (type == SOCK_DGRAM) {
+ /*
+ * Datagram socket connection is asymmetric.
+ */
+ error = soconnect2(so2, so1);
+ if (error != 0)
+ goto free4;
+ }
+ finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
+ &socketops);
+ finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
+ &socketops);
+ if ((fflag & FNONBLOCK) != 0) {
+ (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
+ (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
+ }
+ fdrop(fp1, td);
+ fdrop(fp2, td);
+ return (0);
+free4:
+ fdclose(fdp, fp2, rsv[1], td);
+ fdrop(fp2, td);
+free3:
+ fdclose(fdp, fp1, rsv[0], td);
+ fdrop(fp1, td);
+free2:
+ if (so2 != NULL)
+ (void)soclose(so2);
+free1:
+ if (so1 != NULL)
+ (void)soclose(so1);
+ return (error);
+}
+
+int
+sys_socketpair(struct thread *td, struct socketpair_args *uap)
+{
+ int error, sv[2];
+
+ error = kern_socketpair(td, uap->domain, uap->type,
+ uap->protocol, sv);
+ if (error != 0)
+ return (error);
+ error = copyout(sv, uap->rsv, 2 * sizeof(int));
+ if (error != 0) {
+ (void)kern_close(td, sv[0]);
+ (void)kern_close(td, sv[1]);
+ }
+ return (error);
+}
+
+static int
+sendit(td, s, mp, flags)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ int flags;
+{
+ struct mbuf *control;
+ struct sockaddr *to;
+ int error;
+
+#ifdef CAPABILITY_MODE
+ if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
+ return (ECAPMODE);
+#endif
+
+ if (mp->msg_name != NULL) {
+ error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
+ if (error != 0) {
+ to = NULL;
+ goto bad;
+ }
+ mp->msg_name = to;
+ } else {
+ to = NULL;
+ }
+
+ if (mp->msg_control) {
+ if (mp->msg_controllen < sizeof(struct cmsghdr)
+#ifdef COMPAT_OLDSOCK
+ && mp->msg_flags != MSG_COMPAT
+#endif
+ ) {
+ error = EINVAL;
+ goto bad;
+ }
+ error = sockargs(&control, mp->msg_control,
+ mp->msg_controllen, MT_CONTROL);
+ if (error != 0)
+ goto bad;
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags == MSG_COMPAT) {
+ struct cmsghdr *cm;
+
+ M_PREPEND(control, sizeof(*cm), M_WAITOK);
+ cm = mtod(control, struct cmsghdr *);
+ cm->cmsg_len = control->m_len;
+ cm->cmsg_level = SOL_SOCKET;
+ cm->cmsg_type = SCM_RIGHTS;
+ }
+#endif
+ } else {
+ control = NULL;
+ }
+
+ error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
+
+bad:
+ free(to, M_SONAME);
+ return (error);
+}
+
+int
+kern_sendit(td, s, mp, flags, control, segflg)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ int flags;
+ struct mbuf *control;
+ enum uio_seg segflg;
+{
+ struct file *fp;
+ struct uio auio;
+ struct iovec *iov;
+ struct socket *so;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int i, error;
+
+ AUDIT_ARG_FD(s);
+ cap_rights_init(&rights, CAP_SEND);
+ if (mp->msg_name != NULL) {
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+ error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = (struct socket *)fp->f_data;
+
+#ifdef KTRACE
+ if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(mp->msg_name);
+#endif
+#ifdef MAC
+ if (mp->msg_name != NULL) {
+ error = mac_socket_check_connect(td->td_ucred, so,
+ mp->msg_name);
+ if (error != 0)
+ goto bad;
+ }
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto bad;
+#endif
+
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ error = EINVAL;
+ goto bad;
+ }
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif
+ len = auio.uio_resid;
+ error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(s, UIO_WRITE, ktruio, error);
+ }
+#endif
+bad:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_sendto(td, uap)
+ struct thread *td;
+ struct sendto_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t to;
+ int tolen;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = uap->to;
+ msg.msg_namelen = uap->tolen;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = 0;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ return (sendit(td, uap->s, &msg, uap->flags));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+osend(td, uap)
+ struct thread *td;
+ struct osend_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = 0;
+ return (sendit(td, uap->s, &msg, uap->flags));
+}
+
+int
+osendmsg(td, uap)
+ struct thread *td;
+ struct osendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_iov = iov;
+ msg.msg_flags = MSG_COMPAT;
+ error = sendit(td, uap->s, &msg, uap->flags);
+ free(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+sys_sendmsg(td, uap)
+ struct thread *td;
+ struct sendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_iov = iov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ error = sendit(td, uap->s, &msg, uap->flags);
+ free(iov, M_IOV);
+ return (error);
+}
+
+int
+kern_recvit(td, s, mp, fromseg, controlp)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ enum uio_seg fromseg;
+ struct mbuf **controlp;
+{
+ struct uio auio;
+ struct iovec *iov;
+ struct mbuf *m, *control = NULL;
+ caddr_t ctlbuf;
+ struct file *fp;
+ struct socket *so;
+ struct sockaddr *fromsa = NULL;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int error, i;
+
+ if (controlp != NULL)
+ *controlp = NULL;
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_RECV), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+
+#ifdef MAC
+ error = mac_socket_check_receive(td->td_ucred, so);
+ if (error != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif
+ len = auio.uio_resid;
+ error = soreceive(so, &fromsa, &auio, NULL,
+ (mp->msg_control || controlp) ? &control : NULL,
+ &mp->msg_flags);
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ if (fromsa != NULL)
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = len - auio.uio_resid;
+ ktrgenio(s, UIO_READ, ktruio, error);
+ }
+#endif
+ if (error != 0)
+ goto out;
+ td->td_retval[0] = len - auio.uio_resid;
+ if (mp->msg_name) {
+ len = mp->msg_namelen;
+ if (len <= 0 || fromsa == NULL)
+ len = 0;
+ else {
+ /* save sa_len before it is destroyed by MSG_COMPAT */
+ len = MIN(len, fromsa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ ((struct osockaddr *)fromsa)->sa_family =
+ fromsa->sa_family;
+#endif
+ if (fromseg == UIO_USERSPACE) {
+ error = copyout(fromsa, mp->msg_name,
+ (unsigned)len);
+ if (error != 0)
+ goto out;
+ } else
+ bcopy(fromsa, mp->msg_name, len);
+ }
+ mp->msg_namelen = len;
+ }
+ if (mp->msg_control && controlp == NULL) {
+#ifdef COMPAT_OLDSOCK
+ /*
+ * We assume that old recvmsg calls won't receive access
+ * rights and other control info, esp. as control info
+ * is always optional and those options didn't exist in 4.3.
+ * If we receive rights, trim the cmsghdr; anything else
+ * is tossed.
+ */
+ if (control && mp->msg_flags & MSG_COMPAT) {
+ if (mtod(control, struct cmsghdr *)->cmsg_level !=
+ SOL_SOCKET ||
+ mtod(control, struct cmsghdr *)->cmsg_type !=
+ SCM_RIGHTS) {
+ mp->msg_controllen = 0;
+ goto out;
+ }
+ control->m_len -= sizeof (struct cmsghdr);
+ control->m_data += sizeof (struct cmsghdr);
+ }
+#endif
+ len = mp->msg_controllen;
+ m = control;
+ mp->msg_controllen = 0;
+ ctlbuf = mp->msg_control;
+
+ while (m && len > 0) {
+ unsigned int tocopy;
+
+ if (len >= m->m_len)
+ tocopy = m->m_len;
+ else {
+ mp->msg_flags |= MSG_CTRUNC;
+ tocopy = len;
+ }
+
+ if ((error = copyout(mtod(m, caddr_t),
+ ctlbuf, tocopy)) != 0)
+ goto out;
+
+ ctlbuf += tocopy;
+ len -= tocopy;
+ m = m->m_next;
+ }
+ mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
+ }
+out:
+ fdrop(fp, td);
+#ifdef KTRACE
+ if (fromsa && KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(fromsa);
+#endif
+ free(fromsa, M_SONAME);
+
+ if (error == 0 && controlp != NULL)
+ *controlp = control;
+ else if (control)
+ m_freem(control);
+
+ return (error);
+}
+
+static int
+recvit(td, s, mp, namelenp)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ void *namelenp;
+{
+ int error;
+
+ error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
+ if (error != 0)
+ return (error);
+ if (namelenp != NULL) {
+ error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ error = 0; /* old recvfrom didn't check */
+#endif
+ }
+ return (error);
+}
+
+int
+sys_recvfrom(td, uap)
+ struct thread *td;
+ struct recvfrom_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ struct sockaddr * __restrict from;
+ socklen_t * __restrict fromlenaddr;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ if (uap->fromlenaddr) {
+ error = copyin(uap->fromlenaddr,
+ &msg.msg_namelen, sizeof (msg.msg_namelen));
+ if (error != 0)
+ goto done2;
+ } else {
+ msg.msg_namelen = 0;
+ }
+ msg.msg_name = uap->from;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ error = recvit(td, uap->s, &msg, uap->fromlenaddr);
+done2:
+ return (error);
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+orecvfrom(td, uap)
+ struct thread *td;
+ struct recvfrom_args *uap;
+{
+
+ uap->flags |= MSG_COMPAT;
+ return (sys_recvfrom(td, uap));
+}
+#endif
+
+#ifdef COMPAT_OLDSOCK
+int
+orecv(td, uap)
+ struct thread *td;
+ struct orecv_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(td, uap->s, &msg, NULL));
+}
+
+/*
+ * Old recvmsg. This code takes advantage of the fact that the old msghdr
+ * overlays the new one, missing only the flags, and with the (old) access
+ * rights where the control fields are now.
+ */
+int
+orecvmsg(td, uap)
+ struct thread *td;
+ struct orecvmsg_args /* {
+ int s;
+ struct omsghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_flags = uap->flags | MSG_COMPAT;
+ msg.msg_iov = iov;
+ error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
+ if (msg.msg_controllen && error == 0)
+ error = copyout(&msg.msg_controllen,
+ &uap->msg->msg_accrightslen, sizeof (int));
+ free(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+sys_recvmsg(td, uap)
+ struct thread *td;
+ struct recvmsg_args /* {
+ int s;
+ struct msghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *uiov, *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_flags = uap->flags;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags &= ~MSG_COMPAT;
+#endif
+ uiov = msg.msg_iov;
+ msg.msg_iov = iov;
+ error = recvit(td, uap->s, &msg, NULL);
+ if (error == 0) {
+ msg.msg_iov = uiov;
+ error = copyout(&msg, uap->msg, sizeof(msg));
+ }
+ free(iov, M_IOV);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_shutdown(td, uap)
+ struct thread *td;
+ struct shutdown_args /* {
+ int s;
+ int how;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->s);
+ error = getsock_cap(td->td_proc->p_fd, uap->s,
+ cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = soshutdown(so, uap->how);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_setsockopt(td, uap)
+ struct thread *td;
+ struct setsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int valsize;
+ } */ *uap;
+{
+
+ return (kern_setsockopt(td, uap->s, uap->level, uap->name,
+ uap->val, UIO_USERSPACE, uap->valsize));
+}
+
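+/*
+ * Kernel version of setsockopt.
+ * val may be a userland or kernel address, as selected by valseg.
+ */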
+int
+kern_setsockopt(td, s, level, name, val, valseg, valsize)
+ struct thread *td;
+ int s;
+ int level;
+ int name;
+ void *val;
+ enum uio_seg valseg;
+ socklen_t valsize;
+{
+ struct socket *so;
+ struct file *fp;
+ struct sockopt sopt;
+ cap_rights_t rights;
+ int error;
+
+ if (val == NULL && valsize != 0)
+ return (EFAULT);
+ if ((int)valsize < 0)
+ return (EINVAL);
+
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = level;
+ sopt.sopt_name = name;
+ sopt.sopt_val = val;
+ sopt.sopt_valsize = valsize;
+ switch (valseg) {
+ case UIO_USERSPACE:
+ sopt.sopt_td = td;
+ break;
+ case UIO_SYSSPACE:
+ sopt.sopt_td = NULL;
+ break;
+ default:
+ panic("kern_setsockopt called with bad valseg");
+ }
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = sosetopt(so, &sopt);
+ fdrop(fp, td);
+ }
+	return (error);
+}
+
+/* ARGSUSED */
+int
+sys_getsockopt(td, uap)
+ struct thread *td;
+ struct getsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ void * __restrict val;
+ socklen_t * __restrict avalsize;
+ } */ *uap;
+{
+ socklen_t valsize;
+ int error;
+
+ if (uap->val) {
+ error = copyin(uap->avalsize, &valsize, sizeof (valsize));
+ if (error != 0)
+ return (error);
+ }
+
+ error = kern_getsockopt(td, uap->s, uap->level, uap->name,
+ uap->val, UIO_USERSPACE, &valsize);
+
+ if (error == 0)
+ error = copyout(&valsize, uap->avalsize, sizeof (valsize));
+ return (error);
+}
+
+/*
+ * Kernel version of getsockopt.
+ * optval can be either a userland or a kernel address, as selected by
+ * valseg.  optlen is always a kernel pointer.
+ */
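+/*
+ * Illustrative sketch (not part of this change): an in-kernel consumer
+ * holding a hypothetical descriptor number 'fd' on the current thread
+ * 'td' could set and query SO_REUSEADDR entirely from kernel memory by
+ * passing UIO_SYSSPACE, e.g.:
+ *
+ *	int on = 1;
+ *	socklen_t len = sizeof(on);
+ *
+ *	error = kern_setsockopt(td, fd, SOL_SOCKET, SO_REUSEADDR,
+ *	    &on, UIO_SYSSPACE, sizeof(on));
+ *	if (error == 0)
+ *		error = kern_getsockopt(td, fd, SOL_SOCKET, SO_REUSEADDR,
+ *		    &on, UIO_SYSSPACE, &len);
+ */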
+int
+kern_getsockopt(td, s, level, name, val, valseg, valsize)
+ struct thread *td;
+ int s;
+ int level;
+ int name;
+ void *val;
+ enum uio_seg valseg;
+ socklen_t *valsize;
+{
+ struct socket *so;
+ struct file *fp;
+ struct sockopt sopt;
+ cap_rights_t rights;
+ int error;
+
+ if (val == NULL)
+ *valsize = 0;
+ if ((int)*valsize < 0)
+ return (EINVAL);
+
+ sopt.sopt_dir = SOPT_GET;
+ sopt.sopt_level = level;
+ sopt.sopt_name = name;
+ sopt.sopt_val = val;
+ sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
+ switch (valseg) {
+ case UIO_USERSPACE:
+ sopt.sopt_td = td;
+ break;
+ case UIO_SYSSPACE:
+ sopt.sopt_td = NULL;
+ break;
+ default:
+ panic("kern_getsockopt called with bad valseg");
+ }
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = sogetopt(so, &sopt);
+ *valsize = sopt.sopt_valsize;
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * getsockname1() - Get socket name.
+ */
+/* ARGSUSED */
+static int
+getsockname1(td, uap, compat)
+ struct thread *td;
+ struct getsockname_args /* {
+ int fdes;
+ struct sockaddr * __restrict asa;
+ socklen_t * __restrict alen;
+ } */ *uap;
+ int compat;
+{
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
+
+ error = copyin(uap->alen, &len, sizeof(len));
+ if (error != 0)
+ return (error);
+
+ error = kern_getsockname(td, uap->fdes, &sa, &len);
+ if (error != 0)
+ return (error);
+
+ if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
+ if (error == 0)
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ socklen_t len;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ *sa = NULL;
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
+ CURVNET_RESTORE();
+ if (error != 0)
+ goto bad;
+ if (*sa == NULL)
+ len = 0;
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(*sa);
+#endif
+bad:
+ fdrop(fp, td);
+ if (error != 0 && *sa != NULL) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
+ return (error);
+}
+
+int
+sys_getsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/*
+ * getpeername1() - Get name of peer for connected socket.
+ */
+/* ARGSUSED */
+static int
+getpeername1(td, uap, compat)
+ struct thread *td;
+ struct getpeername_args /* {
+ int fdes;
+ struct sockaddr * __restrict asa;
+ socklen_t * __restrict alen;
+ } */ *uap;
+ int compat;
+{
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
+
+ error = copyin(uap->alen, &len, sizeof (len));
+ if (error != 0)
+ return (error);
+
+ error = kern_getpeername(td, uap->fdes, &sa, &len);
+ if (error != 0)
+ return (error);
+
+ if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
+ if (error == 0)
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ socklen_t len;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+ *sa = NULL;
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
+ CURVNET_RESTORE();
+ if (error != 0)
+ goto bad;
+ if (*sa == NULL)
+ len = 0;
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(*sa);
+#endif
+bad:
+ if (error != 0 && *sa != NULL) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
+done:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_getpeername(td, uap)
+ struct thread *td;
+ struct getpeername_args *uap;
+{
+
+ return (getpeername1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetpeername(td, uap)
+ struct thread *td;
+ struct ogetpeername_args *uap;
+{
+
+ /* XXX uap should have type `getpeername_args *' to begin with. */
+ return (getpeername1(td, (struct getpeername_args *)uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
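+/*
+ * Copy a user buffer of at most MCLBYTES into a newly allocated mbuf of
+ * the requested type.  For MT_SONAME the copied data is treated as a
+ * sockaddr and its sa_len is forced to the supplied length.
+ */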
+int
+sockargs(mp, buf, buflen, type)
+ struct mbuf **mp;
+ caddr_t buf;
+ int buflen, type;
+{
+ struct sockaddr *sa;
+ struct mbuf *m;
+ int error;
+
+ if (buflen > MLEN) {
+#ifdef COMPAT_OLDSOCK
+ if (type == MT_SONAME && buflen <= 112)
+ buflen = MLEN; /* unix domain compat. hack */
+ else
+#endif
+ if (buflen > MCLBYTES)
+ return (EINVAL);
+ }
+ m = m_get2(buflen, M_WAITOK, type, 0);
+ m->m_len = buflen;
+ error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
+ if (error != 0)
+ (void) m_free(m);
+ else {
+ *mp = m;
+ if (type == MT_SONAME) {
+ sa = mtod(m, struct sockaddr *);
+
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = buflen;
+ }
+ }
+ return (error);
+}
+
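+/*
+ * Copy a sockaddr from userland into a freshly malloc'ed buffer,
+ * rejecting lengths that are too small to hold a sockaddr header or
+ * larger than SOCK_MAXADDRLEN.  The caller must free the result with
+ * free(*namp, M_SONAME).
+ */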
+int
+getsockaddr(namp, uaddr, len)
+ struct sockaddr **namp;
+ caddr_t uaddr;
+ size_t len;
+{
+ struct sockaddr *sa;
+ int error;
+
+ if (len > SOCK_MAXADDRLEN)
+ return (ENAMETOOLONG);
+ if (len < offsetof(struct sockaddr, sa_data[0]))
+ return (EINVAL);
+ sa = malloc(len, M_SONAME, M_WAITOK);
+ error = copyin(uaddr, sa, len);
+ if (error != 0) {
+ free(sa, M_SONAME);
+ } else {
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = len;
+ *namp = sa;
+ }
+ return (error);
+}
+
+#include <sys/condvar.h>
+
+struct sendfile_sync {
+ struct mtx mtx;
+ struct cv cv;
+ unsigned count;
+};
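+
+/*
+ * When sendfile(2) is invoked with SF_SYNC, a sendfile_sync tracks the
+ * number of sf_buf-backed mbufs still in flight; sf_buf_mext() decrements
+ * the count as each one is freed, and the syscall sleeps on the condition
+ * variable until the count reaches zero.
+ */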
+
+/*
+ * Detach mapped page and release resources back to the system.
+ */
+int
+sf_buf_mext(struct mbuf *mb, void *addr, void *args)
+{
+ vm_page_t m;
+ struct sendfile_sync *sfs;
+
+ m = sf_buf_page(args);
+ sf_buf_free(args);
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (m->wire_count == 0 && m->object == NULL)
+ vm_page_free(m);
+ vm_page_unlock(m);
+ if (addr == NULL)
+ return (EXT_FREE_OK);
+ sfs = addr;
+ mtx_lock(&sfs->mtx);
+	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+ if (--sfs->count == 0)
+ cv_signal(&sfs->cv);
+ mtx_unlock(&sfs->mtx);
+ return (EXT_FREE_OK);
+}
+
+/*
+ * sendfile(2)
+ *
+ * int sendfile(int fd, int s, off_t offset, size_t nbytes,
+ * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
+ *
+ * Send a file specified by 'fd' and starting at 'offset' to a socket
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
+ * 0. Optionally add a header and/or trailer to the socket output. If
+ * specified, write the total number of bytes sent into *sbytes.
+ */
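+/*
+ * Illustrative userland usage (hypothetical descriptors 'filefd' and
+ * 'sock'): send the whole file and learn how much was transmitted even
+ * when the call fails part-way:
+ *
+ *	off_t sbytes;
+ *
+ *	if (sendfile(filefd, sock, 0, 0, NULL, &sbytes, 0) == -1)
+ *		err(1, "sendfile after %jd bytes", (intmax_t)sbytes);
+ */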
+int
+sys_sendfile(struct thread *td, struct sendfile_args *uap)
+{
+
+ return (do_sendfile(td, uap, 0));
+}
+
+static int
+do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
+{
+ struct sf_hdtr hdtr;
+ struct uio *hdr_uio, *trl_uio;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ if (uap->offset < 0)
+ return (EINVAL);
+
+ hdr_uio = trl_uio = NULL;
+
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error != 0)
+ goto out;
+ if (hdtr.headers != NULL) {
+ error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
+ if (error != 0)
+ goto out;
+ }
+ if (hdtr.trailers != NULL) {
+ error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
+ if (error != 0)
+ goto out;
+
+ }
+ }
+
+ AUDIT_ARG_FD(uap->fd);
+
+ /*
+ * sendfile(2) can start at any offset within a file so we require
+ * CAP_READ+CAP_SEEK = CAP_PREAD.
+ */
+ if ((error = fget_read(td, uap->fd,
+ cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
+ goto out;
+ }
+
+ error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
+ uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
+ fdrop(fp, td);
+
+out:
+ free(hdr_uio, M_IOV);
+ free(trl_uio, M_IOV);
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD4
+int
+freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
+{
+ struct sendfile_args args;
+
+ args.fd = uap->fd;
+ args.s = uap->s;
+ args.offset = uap->offset;
+ args.nbytes = uap->nbytes;
+ args.hdtr = uap->hdtr;
+ args.sbytes = uap->sbytes;
+ args.flags = uap->flags;
+
+ return (do_sendfile(td, &args, 1));
+}
+#endif /* COMPAT_FREEBSD4 */
+
+int
+vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+ struct vnode *vp = fp->f_vnode;
+ struct file *sock_fp;
+ struct vm_object *obj = NULL;
+ struct socket *so = NULL;
+ struct mbuf *m = NULL;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ struct vattr va;
+ struct sendfile_sync *sfs = NULL;
+ cap_rights_t rights;
+ off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+ int bsize, error, hdrlen = 0, mnw = 0;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (vp->v_type == VREG) {
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ if (nbytes == 0) {
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ obj = NULL;
+ goto out;
+ }
+ rem = va.va_size;
+ } else
+ rem = nbytes;
+ obj = vp->v_object;
+ if (obj != NULL) {
+ /*
+ * Temporarily increase the backing VM
+ * object's reference count so that a forced
+ * reclamation of its vnode does not
+ * immediately destroy it.
+ */
+ VM_OBJECT_WLOCK(obj);
+ if ((obj->flags & OBJ_DEAD) == 0) {
+ vm_object_reference_locked(obj);
+ VM_OBJECT_WUNLOCK(obj);
+ } else {
+ VM_OBJECT_WUNLOCK(obj);
+ obj = NULL;
+ }
+ }
+ } else
+ bsize = 0; /* silence gcc */
+ VOP_UNLOCK(vp, 0);
+ if (obj == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The socket must be a stream socket and connected.
+	 * Remember whether it is a blocking or non-blocking socket.
+ */
+ error = getsock_cap(td->td_proc->p_fd, sockfd,
+ cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL);
+ if (error != 0)
+ goto out;
+ so = sock_fp->f_data;
+ if (so->so_type != SOCK_STREAM) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto out;
+ }
+ /*
+ * Do not wait on memory allocations but return ENOMEM for
+ * caller to retry later.
+ * XXX: Experimental.
+ */
+ if (flags & SF_MNOWAIT)
+ mnw = 1;
+
+ if (flags & SF_SYNC) {
+ sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
+ mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
+ cv_init(&sfs->cv, "sendfile");
+ }
+
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto out;
+#endif
+
+ /* If headers are specified copy them into mbufs. */
+ if (hdr_uio != NULL) {
+ hdr_uio->uio_td = td;
+ hdr_uio->uio_rw = UIO_WRITE;
+ if (hdr_uio->uio_resid > 0) {
+ /*
+ * In FBSD < 5.0 the nbytes to send also included
+ * the header. If compat is specified subtract the
+ * header size from nbytes.
+ */
+ if (kflags & SFK_COMPAT) {
+ if (nbytes > hdr_uio->uio_resid)
+ nbytes -= hdr_uio->uio_resid;
+ else
+ nbytes = 0;
+ }
+ m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+ 0, 0, 0);
+ if (m == NULL) {
+ error = mnw ? EAGAIN : ENOBUFS;
+ goto out;
+ }
+ hdrlen = m_length(m, NULL);
+ }
+ }
+
+ /*
+ * Protect against multiple writers to the socket.
+ *
+ * XXXRW: Historically this has assumed non-interruptibility, so now
+ * we implement that, but possibly shouldn't.
+ */
+ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
+
+ /*
+ * Loop through the pages of the file, starting with the requested
+ * offset. Get a file page (do I/O if necessary), map the file page
+ * into an sf_buf, attach an mbuf header to the sf_buf, and queue
+ * it on the socket.
+	 * This is done in two loops.  The inner loop turns as many pages as
+	 * it can into mbufs, up to the available socket buffer space,
+	 * without blocking, so that they can be bulk delivered into the
+	 * socket send buffer.
+ * The outer loop checks the state and available space of the socket
+ * and takes care of the overall progress.
+ */
+ for (off = offset; ; ) {
+ struct mbuf *mtail;
+ int loopbytes;
+ int space;
+ int done;
+
+ if ((nbytes != 0 && nbytes == fsbytes) ||
+ (nbytes == 0 && va.va_size == fsbytes))
+ break;
+
+ mtail = NULL;
+ loopbytes = 0;
+ space = 0;
+ done = 0;
+
+ /*
+ * Check the socket state for ongoing connection,
+ * no errors and space in socket buffer.
+ * If space is low allow for the remainder of the
+ * file to be processed if it fits the socket buffer.
+ * Otherwise block in waiting for sufficient space
+ * to proceed, or if the socket is nonblocking, return
+ * to userland with EAGAIN while reporting how far
+ * we've come.
+ * We wait until the socket buffer has significant free
+ * space to do bulk sends. This makes good use of file
+ * system read ahead and allows packet segmentation
+ * offloading hardware to take over lots of work. If
+ * we were not careful here we would send off only one
+ * sfbuf at a time.
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ } else if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ space = sbspace(&so->so_snd);
+ if (space < rem &&
+ (space <= 0 ||
+ space < so->so_snd.sb_lowat)) {
+ if (so->so_state & SS_NBIO) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EAGAIN;
+ goto done;
+ }
+ /*
+ * sbwait drops the lock while sleeping.
+ * When we loop back to retry_space the
+ * state may have changed and we retest
+ * for it.
+ */
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
+ */
+ if (error != 0) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ goto retry_space;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /*
+ * Reduce space in the socket buffer by the size of
+ * the header mbuf chain.
+ * hdrlen is set to 0 after the first loop.
+ */
+ space -= hdrlen;
+
+ error = vn_lock(vp, LK_SHARED);
+ if (error != 0)
+ goto done;
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error != 0 || off >= va.va_size) {
+ VOP_UNLOCK(vp, 0);
+ goto done;
+ }
+
+ /*
+ * Loop and construct maximum sized mbuf chain to be bulk
+ * dumped into socket buffer.
+ */
+ while (space > loopbytes) {
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+ struct mbuf *m0;
+
+ /*
+ * Calculate the amount to transfer.
+ * Not to exceed a page, the EOF,
+ * or the passed in nbytes.
+ */
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ if (nbytes)
+ rem = (nbytes - fsbytes - loopbytes);
+ else
+ rem = va.va_size -
+ offset - fsbytes - loopbytes;
+ xfsize = omin(PAGE_SIZE - pgoff, rem);
+ xfsize = omin(space - loopbytes, xfsize);
+ if (xfsize <= 0) {
+ done = 1; /* all data sent */
+ break;
+ }
+
+ /*
+ * Attempt to look up the page. Allocate
+ * if not found or wait and loop if busy.
+ */
+ pindex = OFF_TO_IDX(off);
+ VM_OBJECT_WLOCK(obj);
+ pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+ VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
+ VM_ALLOC_WIRED);
+
+ /*
+ * Check if page is valid for what we need,
+ * otherwise initiate I/O.
+ * If we already turned some pages into mbufs,
+ * send them off before we come here again and
+ * block.
+ */
+ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
+ VM_OBJECT_WUNLOCK(obj);
+ else if (m != NULL)
+ error = EAGAIN; /* send what we already got */
+ else if (flags & SF_NODISKIO)
+ error = EBUSY;
+ else {
+ ssize_t resid;
+ int readahead = sfreadahead * MAXBSIZE;
+
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Get the page from backing store.
+ * XXXMAC: Because we don't have fp->f_cred
+ * here, we pass in NOCRED. This is probably
+ * wrong, but is consistent with our original
+ * implementation.
+ */
+ error = vn_rdwr(UIO_READ, vp, NULL, readahead,
+ trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+ IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
+ td->td_ucred, NOCRED, &resid, td);
+ SFSTAT_INC(sf_iocnt);
+ if (error != 0)
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (error != 0) {
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about
+ * this page. If not and it is not valid,
+ * then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ !vm_page_busied(pg))
+ vm_page_free(pg);
+ vm_page_unlock(pg);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error == EAGAIN)
+ error = 0; /* not a real error */
+ break;
+ }
+
+ /*
+ * Get a sendfile buf. When allocating the
+ * first buffer for mbuf chain, we usually
+ * wait as long as necessary, but this wait
+ * can be interrupted. For consequent
+ * buffers, do not sleep, since several
+ * threads might exhaust the buffers and then
+ * deadlock.
+ */
+ sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
+ SFB_CATCH);
+ if (sf == NULL) {
+ SFSTAT_INC(sf_allocfail);
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ KASSERT(pg->object != NULL,
+ ("%s: object disappeared", __func__));
+ vm_page_unlock(pg);
+ if (m == NULL)
+ error = (mnw ? EAGAIN : EINTR);
+ break;
+ }
+
+ /*
+ * Get an mbuf and set it up as having
+ * external storage.
+ */
+ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+ if (m0 == NULL) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ (void)sf_buf_mext(NULL, NULL, sf);
+ break;
+ }
+ if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+ sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
+ (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ (void)sf_buf_mext(NULL, NULL, sf);
+ m_freem(m0);
+ break;
+ }
+ m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+ m0->m_len = xfsize;
+
+ /* Append to mbuf chain. */
+ if (mtail != NULL)
+ mtail->m_next = m0;
+ else if (m != NULL)
+ m_last(m)->m_next = m0;
+ else
+ m = m0;
+ mtail = m0;
+
+ /* Keep track of bits processed. */
+ loopbytes += xfsize;
+ off += xfsize;
+
+ if (sfs != NULL) {
+ mtx_lock(&sfs->mtx);
+ sfs->count++;
+ mtx_unlock(&sfs->mtx);
+ }
+ }
+
+ VOP_UNLOCK(vp, 0);
+
+ /* Add the buffer chain to the socket buffer. */
+ if (m != NULL) {
+ int mlen, err;
+
+ mlen = m_length(m, NULL);
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ CURVNET_SET(so->so_vnet);
+ /* Avoid error aliasing. */
+ err = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, NULL, NULL, td);
+ CURVNET_RESTORE();
+ if (err == 0) {
+ /*
+ * We need two counters to get the
+ * file offset and nbytes to send
+ * right:
+ * - sbytes contains the total amount
+ * of bytes sent, including headers.
+ * - fsbytes contains the total amount
+ * of bytes sent from the file.
+ */
+ sbytes += mlen;
+ fsbytes += mlen;
+ if (hdrlen) {
+ fsbytes -= hdrlen;
+ hdrlen = 0;
+ }
+ } else if (error == 0)
+ error = err;
+ m = NULL; /* pru_send always consumes */
+ }
+
+ /* Quit outer loop on error or when we're done. */
+ if (done)
+ break;
+ if (error != 0)
+ goto done;
+ }
+
+ /*
+ * Send trailers. Wimp out and use writev(2).
+ */
+ if (trl_uio != NULL) {
+ sbunlock(&so->so_snd);
+ error = kern_writev(td, sockfd, trl_uio);
+ if (error == 0)
+ sbytes += td->td_retval[0];
+ goto out;
+ }
+
+done:
+ sbunlock(&so->so_snd);
+out:
+ /*
+ * If there was no error we have to clear td->td_retval[0]
+ * because it may have been set by writev.
+ */
+ if (error == 0) {
+ td->td_retval[0] = 0;
+ }
+ if (sent != NULL) {
+ copyout(&sbytes, sent, sizeof(off_t));
+ }
+ if (obj != NULL)
+ vm_object_deallocate(obj);
+ if (so)
+ fdrop(sock_fp, td);
+ if (m)
+ m_freem(m);
+
+ if (sfs != NULL) {
+ mtx_lock(&sfs->mtx);
+ if (sfs->count != 0)
+ cv_wait(&sfs->cv, &sfs->mtx);
+ KASSERT(sfs->count == 0, ("sendfile sync still busy"));
+ cv_destroy(&sfs->cv);
+ mtx_destroy(&sfs->mtx);
+ free(sfs, M_TEMP);
+ }
+
+ if (error == ERESTART)
+ error = EINTR;
+
+ return (error);
+}
+
+/*
+ * SCTP syscalls.
+ * Functionality is only compiled in if SCTP is defined in the kernel
+ * configuration; otherwise they all return EOPNOTSUPP.
+ * XXX: We should make this loadable one day.
+ */
+int
+sys_sctp_peeloff(td, uap)
+ struct thread *td;
+ struct sctp_peeloff_args /* {
+ int sd;
+ caddr_t name;
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct file *nfp = NULL;
+ struct socket *head, *so;
+ cap_rights_t rights;
+ u_int fflag;
+ int error, fd;
+
+ AUDIT_ARG_FD(uap->sd);
+ error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
+ &head, &fflag);
+ if (error != 0)
+ goto done2;
+ if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto done;
+ }
+ error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
+ if (error != 0)
+ goto done;
+ /*
+	 * At this point we know we have an association to peel off, so we
+	 * proceed to set up the file descriptor.  This may block, but that
+	 * is OK.
+ */
+
+ error = falloc(td, &nfp, &fd, 0);
+ if (error != 0)
+ goto done;
+ td->td_retval[0] = fd;
+
+ CURVNET_SET(head->so_vnet);
+ so = sonewconn(head, SS_ISCONNECTED);
+ if (so == NULL) {
+ error = ENOMEM;
+ goto noconnection;
+ }
+ /*
+ * Before changing the flags on the socket, we have to bump the
+ * reference count. Otherwise, if the protocol calls sofree(),
+ * the socket will be released due to a zero refcount.
+ */
+ SOCK_LOCK(so);
+ soref(so); /* file descriptor reference */
+ SOCK_UNLOCK(so);
+
+ ACCEPT_LOCK();
+
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+ so->so_state |= (head->so_state & SS_NBIO);
+ so->so_state &= ~SS_NOFDREF;
+ so->so_qstate &= ~SQ_COMP;
+ so->so_head = NULL;
+ ACCEPT_UNLOCK();
+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
+ error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
+ if (error != 0)
+ goto noconnection;
+ if (head->so_sigio != NULL)
+ fsetown(fgetown(&head->so_sigio), &so->so_sigio);
+
+noconnection:
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error != 0)
+ fdclose(td->td_proc->p_fd, nfp, fd, td);
+
+ /*
+ * Release explicitly held references before returning.
+ */
+ CURVNET_RESTORE();
+done:
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fputsock(head);
+done2:
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_sendmsg(td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_args /* {
+ int sd,
+ caddr_t msg,
+ int mlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec iov[1];
+ cap_rights_t rights;
+ int error = 0, len;
+
+ if (uap->sinfo != NULL) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error != 0)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+
+ cap_rights_init(&rights, CAP_SEND);
+ if (uap->tolen != 0) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error != 0) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
+ if (error != 0)
+ goto sctp_bad;
+#ifdef KTRACE
+ if (to && (KTRPOINT(td, KTR_STRUCT)))
+ ktrsockaddr(to);
+#endif
+
+ iov[0].iov_base = uap->msg;
+ iov[0].iov_len = uap->mlen;
+
+ so = (struct socket *)fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto sctp_bad;
+ }
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ len = auio.uio_resid = uap->mlen;
+ CURVNET_SET(so->so_vnet);
+ error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
+ (struct mbuf *)NULL, uap->flags, u_sinfo, td);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket. */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ if (fp != NULL)
+ fdrop(fp, td);
+sctp_bad2:
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_sendmsg_iov(td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_iov_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec *iov, *tiov;
+ cap_rights_t rights;
+ ssize_t len;
+ int error, i;
+
+ if (uap->sinfo != NULL) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error != 0)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+ cap_rights_init(&rights, CAP_SEND);
+ if (uap->tolen != 0) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error != 0) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
+ if (error != 0)
+ goto sctp_bad1;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
+ uap->iovlen, &iov, EMSGSIZE);
+ else
+#endif
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ goto sctp_bad1;
+#ifdef KTRACE
+ if (to && (KTRPOINT(td, KTR_STRUCT)))
+ ktrsockaddr(to);
+#endif
+
+ so = (struct socket *)fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto sctp_bad;
+ }
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+	for (i = 0; i < uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto sctp_bad;
+ }
+ }
+ len = auio.uio_resid;
+ CURVNET_SET(so->so_vnet);
+ error = sctp_lower_sosend(so, to, &auio,
+ (struct mbuf *)NULL, (struct mbuf *)NULL,
+ uap->flags, u_sinfo, td);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ free(iov, M_IOV);
+sctp_bad1:
+ if (fp != NULL)
+ fdrop(fp, td);
+sctp_bad2:
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_recvmsg(td, uap)
+ struct thread *td;
+ struct sctp_generic_recvmsg_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ struct sockaddr *from,
+ __socklen_t *fromlenaddr,
+ struct sctp_sndrcvinfo *sinfo,
+ int *msg_flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ uint8_t sockbufstore[256];
+ struct uio auio;
+ struct iovec *iov, *tiov;
+ struct sctp_sndrcvinfo sinfo;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *fromsa;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int error, fromlen, i, msg_flags;
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd,
+ cap_rights_init(&rights, CAP_RECV), &fp, NULL);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
+ uap->iovlen, &iov, EMSGSIZE);
+ else
+#endif
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ goto out1;
+
+ so = fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_socket_check_receive(td->td_ucred, so);
+ if (error != 0)
+ goto out;
+#endif /* MAC */
+
+ if (uap->fromlenaddr != NULL) {
+ error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
+ if (error != 0)
+ goto out;
+ } else {
+ fromlen = 0;
+ }
+ if (uap->msg_flags) {
+ error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
+ if (error != 0)
+ goto out;
+ } else {
+ msg_flags = 0;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+	for (i = 0; i < uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+ len = auio.uio_resid;
+ fromsa = (struct sockaddr *)sockbufstore;
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif /* KTRACE */
+ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ CURVNET_SET(so->so_vnet);
+ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
+ fromsa, fromlen, &msg_flags,
+ (struct sctp_sndrcvinfo *)&sinfo, 1);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ } else {
+ if (uap->sinfo)
+ error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
+ }
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = len - auio.uio_resid;
+ ktrgenio(uap->sd, UIO_READ, ktruio, error);
+ }
+#endif /* KTRACE */
+ if (error != 0)
+ goto out;
+ td->td_retval[0] = len - auio.uio_resid;
+
+ if (fromlen && uap->from) {
+ len = fromlen;
+		if (len <= 0 || fromsa == NULL)
+ len = 0;
+ else {
+ len = MIN(len, fromsa->sa_len);
+ error = copyout(fromsa, uap->from, (size_t)len);
+ if (error != 0)
+ goto out;
+ }
+ error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
+ if (error != 0)
+ goto out;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(fromsa);
+#endif
+ if (uap->msg_flags) {
+ error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
+ if (error != 0)
+ goto out;
+ }
+out:
+ free(iov, M_IOV);
+out1:
+ if (fp != NULL)
+ fdrop(fp, td);
+
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
new file mode 100644
index 0000000..c0a5d2e
--- /dev/null
+++ b/sys/kern/uipc_usrreq.c
@@ -0,0 +1,2505 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2004-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
+ */
+
+/*
+ * UNIX Domain (Local) Sockets
+ *
+ * This is an implementation of UNIX (local) domain sockets. Each socket has
+ * an associated struct unpcb (UNIX protocol control block). Stream sockets
+ * may be connected to 0 or 1 other socket. Datagram sockets may be
+ * connected to 0, 1, or many other sockets. Sockets may be created and
+ * connected in pairs (socketpair(2)), or bound/connected to using the file
+ * system name space. For most purposes, only the receive socket buffer is
+ * used, as sending on one socket delivers directly to the receive socket
+ * buffer of a second socket.
+ *
+ * The implementation is substantially complicated by the fact that
+ * "ancillary data", such as file descriptors or credentials, may be passed
+ * across UNIX domain sockets. The potential for passing UNIX domain sockets
+ * over other UNIX domain sockets requires the implementation of a simple
+ * garbage collector to find and tear down cycles of disconnected sockets.
+ *
+ * TODO:
+ * RDM
+ * distinguish datagram size limits from flow control limits in SEQPACKET
+ * rethink name space problems
+ * need a proper out-of-band
+ */
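+
+/*
+ * Illustrative sketch of the direct-delivery model described above: a
+ * connected pair created with socketpair(2) exchanges data with no
+ * network stack involvement, so a write on one descriptor lands straight
+ * in the peer's receive socket buffer.
+ *
+ *	int fds[2];
+ *
+ *	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) == 0)
+ *		(void)write(fds[0], "hi", 2);	(read(fds[1], ...) sees "hi")
+ */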
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h> /* XXX must be before <sys/file.h> */
+#include <sys/eventhandler.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/vnode.h>
+
+#include <net/vnet.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+MALLOC_DECLARE(M_FILECAPS);
+
+/*
+ * Locking key:
+ * (l) Locked using list lock
+ * (g) Locked using linkage lock
+ */
+
+static uma_zone_t unp_zone;
+static unp_gen_t unp_gencnt; /* (l) */
+static u_int unp_count; /* (l) Count of local sockets. */
+static ino_t unp_ino; /* Prototype for fake inode numbers. */
+static int unp_rights; /* (g) File descriptors in flight. */
+static struct unp_head unp_shead; /* (l) List of stream sockets. */
+static struct unp_head unp_dhead; /* (l) List of datagram sockets. */
+static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */
+
+struct unp_defer {
+ SLIST_ENTRY(unp_defer) ud_link;
+ struct file *ud_fp;
+};
+static SLIST_HEAD(, unp_defer) unp_defers;
+static int unp_defers_count;
+
+static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
+
+/*
+ * Garbage collection of cyclic file descriptor/socket references occurs
+ * asynchronously in a taskqueue context in order to avoid recursion and
+ * reentrance in the UNIX domain socket, file descriptor, and socket layer
+ * code. See unp_gc() for a full description.
+ */
+static struct timeout_task unp_gc_task;
+
+/*
+ * The close of UNIX domain sockets attached as SCM_RIGHTS is
+ * postponed to the taskqueue, to avoid arbitrary recursion depth.
+ * The attached sockets might themselves have other sockets attached.
+ */
+static struct task unp_defer_task;
+
+/*
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
+ * stream sockets, although the total for sender and receiver is actually
+ * only PIPSIZ.
+ *
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace. Their recvspace should be
+ * large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
+static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */
+static u_long unpsp_recvspace = PIPSIZ;
+
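+/*
+ * The defaults above are exported as read-write sysctls below, e.g.
+ * net.local.stream.recvspace and net.local.dgram.maxdgram, so they can
+ * be tuned at runtime.
+ */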
+static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0,
+ "SOCK_STREAM");
+static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
+static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0,
+ "SOCK_SEQPACKET");
+
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "Default stream send space.");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "Default stream receive space.");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "Default datagram send space.");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "Default datagram receive space.");
+SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
+ &unpsp_sendspace, 0, "Default seqpacket send space.");
+SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpsp_recvspace, 0, "Default seqpacket receive space.");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
+ "File descriptors in flight.");
+SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
+ &unp_defers_count, 0,
+ "File descriptors deferred to taskqueue for close.");
+
+/*
+ * Locking and synchronization:
+ *
+ * Three types of locks exist in the local domain socket implementation: a
+ * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the
+ * global locks, the list lock protects the socket count, global generation
+ * number, and stream/datagram global lists. The linkage lock protects the
+ * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
+ * held exclusively over the acquisition of multiple unpcb locks to prevent
+ * deadlock.
+ *
+ * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
+ * allocated in pru_attach() and freed in pru_detach(). The validity of that
+ * pointer is an invariant, so no lock is required to dereference the so_pcb
+ * pointer if a valid socket reference is held by the caller. In practice,
+ * this is always true during operations performed on a socket. Each unpcb
+ * has a back-pointer to its socket, unp_socket, which will be stable under
+ * the same circumstances.
+ *
+ * This pointer may only be safely dereferenced as long as a valid reference
+ * to the unpcb is held. Typically, this reference will be from the socket,
+ * or from another unpcb when the referring unpcb's lock is held (in order
+ * that the reference not be invalidated during use). For example, to follow
+ * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
+ * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ *
+ * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
+ * atomic reads without the lock may be performed "lockless", but more
+ * complex reads and read-modify-writes require the mutex to be held. No
+ * lock order is defined between unpcb locks -- multiple unpcb locks may be
+ * acquired at the same time only when holding the linkage rwlock
+ * exclusively, which prevents deadlocks.
+ *
+ * Blocking with UNIX domain sockets is a tricky issue: unlike most network
+ * protocols, bind() is a non-atomic operation, and connect() requires
+ * potential sleeping in the protocol, due to potentially waiting on local or
+ * distributed file systems. We try to separate "lookup" operations, which
+ * may sleep, and the IPC operations themselves, which typically can occur
+ * with relative atomicity as locks can be held over the entire operation.
+ *
+ * Another tricky issue is simultaneous multi-threaded or multi-process
+ * access to a single UNIX domain socket. These are handled by the flags
+ * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
+ * binding, both of which involve dropping UNIX domain socket locks in order
+ * to perform namei() and other file system operations.
+ */
+static struct rwlock unp_link_rwlock;
+static struct mtx unp_list_lock;
+static struct mtx unp_defers_lock;
+
+#define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \
+ "unp_link_rwlock")
+
+#define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_LOCKED)
+#define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_UNLOCKED)
+
+#define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock)
+#define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock)
+#define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock)
+#define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock)
+#define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_WLOCKED)
+
+#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \
+ "unp_list_lock", NULL, MTX_DEF)
+#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock)
+#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock)
+
+#define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \
+ "unp_defer", NULL, MTX_DEF)
+#define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock)
+#define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock)
+
+#define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
+ "unp_mtx", "unp_mtx", \
+ MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+#define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
+#define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
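+
+/*
+ * Illustrative locking pattern (see e.g. uipc_connect2() below): when two
+ * unpcbs must be locked at once, the linkage lock is taken exclusively
+ * first, which serializes against other multi-pcb lockers:
+ *
+ *	UNP_LINK_WLOCK();
+ *	UNP_PCB_LOCK(unp);
+ *	UNP_PCB_LOCK(unp2);
+ *	...
+ *	UNP_PCB_UNLOCK(unp2);
+ *	UNP_PCB_UNLOCK(unp);
+ *	UNP_LINK_WUNLOCK();
+ */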
+
+static int uipc_connect2(struct socket *, struct socket *);
+static int uipc_ctloutput(struct socket *, struct sockopt *);
+static int unp_connect(struct socket *, struct sockaddr *,
+ struct thread *);
+static int unp_connectat(int, struct socket *, struct sockaddr *,
+ struct thread *);
+static int unp_connect2(struct socket *so, struct socket *so2, int);
+static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
+static void unp_dispose(struct mbuf *);
+static void unp_shutdown(struct unpcb *);
+static void unp_drop(struct unpcb *, int);
+static void unp_gc(__unused void *, int);
+static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
+static void unp_discard(struct file *);
+static void unp_freerights(struct filedescent **, int);
+static void unp_init(void);
+static int unp_internalize(struct mbuf **, struct thread *);
+static void unp_internalize_fp(struct file *);
+static int unp_externalize(struct mbuf *, struct mbuf **, int);
+static int unp_externalize_fp(struct file *);
+static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
+static void unp_process_defers(void * __unused, int);
+
+/*
+ * Definitions of protocols supported in the LOCAL domain.
+ */
+static struct domain localdomain;
+static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
+static struct pr_usrreqs uipc_usrreqs_seqpacket;
+static struct protosw localsw[] = {
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+ .pr_ctloutput = &uipc_ctloutput,
+ .pr_usrreqs = &uipc_usrreqs_stream
+},
+{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+ .pr_ctloutput = &uipc_ctloutput,
+ .pr_usrreqs = &uipc_usrreqs_dgram
+},
+{
+ .pr_type = SOCK_SEQPACKET,
+ .pr_domain = &localdomain,
+
+ /*
+ * XXXRW: For now, PR_ADDR because soreceive will bump into them
+	 * due to our use of sbappendaddr.  A new sbappend variant is needed
+ * that supports both atomic record writes and control data.
+ */
+ .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
+ PR_RIGHTS,
+ .pr_usrreqs = &uipc_usrreqs_seqpacket,
+},
+};
+
+static struct domain localdomain = {
+ .dom_family = AF_LOCAL,
+ .dom_name = "local",
+ .dom_init = unp_init,
+ .dom_externalize = unp_externalize,
+ .dom_dispose = unp_dispose,
+ .dom_protosw = localsw,
+ .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])]
+};
+DOMAIN_SET(local);
+
+static void
+uipc_abort(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_drop(unp2, ECONNABORTED);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+uipc_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp, *unp2;
+ const struct sockaddr *sa;
+
+ /*
+ * Pass back name of connected socket, if it was bound and we are
+ * still connected (our peer may have closed already!).
+ */
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_LINK_RLOCK();
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL && unp2->unp_addr != NULL) {
+ UNP_PCB_LOCK(unp2);
+ sa = (struct sockaddr *) unp2->unp_addr;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ }
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_attach(struct socket *so, int proto, struct thread *td)
+{
+ u_long sendspace, recvspace;
+ struct unpcb *unp;
+ int error;
+
+ KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ sendspace = unpst_sendspace;
+ recvspace = unpst_recvspace;
+ break;
+
+ case SOCK_DGRAM:
+ sendspace = unpdg_sendspace;
+ recvspace = unpdg_recvspace;
+ break;
+
+ case SOCK_SEQPACKET:
+ sendspace = unpsp_sendspace;
+ recvspace = unpsp_recvspace;
+ break;
+
+ default:
+ panic("uipc_attach");
+ }
+ error = soreserve(so, sendspace, recvspace);
+ if (error)
+ return (error);
+ }
+ unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
+ if (unp == NULL)
+ return (ENOBUFS);
+ LIST_INIT(&unp->unp_refs);
+ UNP_PCB_LOCK_INIT(unp);
+ unp->unp_socket = so;
+ so->so_pcb = unp;
+ unp->unp_refcount = 1;
+
+ UNP_LIST_LOCK();
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count++;
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
+ break;
+
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
+ break;
+
+ case SOCK_SEQPACKET:
+ LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
+ break;
+
+ default:
+ panic("uipc_attach");
+ }
+ UNP_LIST_UNLOCK();
+
+ return (0);
+}
+
+static int
+uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vattr vattr;
+ int error, namelen;
+ struct nameidata nd;
+ struct unpcb *unp;
+ struct vnode *vp;
+ struct mount *mp;
+ cap_rights_t rights;
+ char *buf;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
+
+ if (soun->sun_len > sizeof(struct sockaddr_un))
+ return (EINVAL);
+ namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+ if (namelen <= 0)
+ return (EINVAL);
+
+ /*
+ * We don't allow simultaneous bind() calls on a single UNIX domain
+ * socket, so flag in-progress operations, and return an error if an
+ * operation is already in progress.
+ *
+ * Historically, we have not allowed a socket to be rebound, so this
+ * also returns an error. Not allowing re-binding simplifies the
+ * implementation and avoids a great many possible failure modes.
+ */
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode != NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (EINVAL);
+ }
+ if (unp->unp_flags & UNP_BINDING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ unp->unp_flags |= UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+
+ buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
+ bcopy(soun->sun_path, buf, namelen);
+ buf[namelen] = 0;
+
+restart:
+ NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME,
+ UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+ error = namei(&nd);
+ if (error)
+ goto error;
+ vp = nd.ni_vp;
+ if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULL) {
+ vrele(vp);
+ error = EADDRINUSE;
+ goto error;
+ }
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error)
+ goto error;
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VSOCK;
+ vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+#endif
+ if (error == 0)
+ error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error) {
+ vn_finished_write(mp);
+ goto error;
+ }
+ vp = nd.ni_vp;
+ ASSERT_VOP_ELOCKED(vp, "uipc_bind");
+ soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ VOP_UNP_BIND(vp, unp->unp_socket);
+ unp->unp_vnode = vp;
+ unp->unp_addr = soun;
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ free(buf, M_TEMP);
+ return (0);
+
+error:
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ free(buf, M_TEMP);
+ return (error);
+}
+
+static int
+uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (uipc_bindat(AT_FDCWD, so, nam, td));
+}
+
+static int
+uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ KASSERT(td == curthread, ("uipc_connect: td != curthread"));
+ UNP_LINK_WLOCK();
+ error = unp_connect(so, nam, td);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
+
+static int
+uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+ int error;
+
+ KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
+ UNP_LINK_WLOCK();
+ error = unp_connectat(fd, so, nam, td);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
+
+static void
+uipc_close(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+uipc_connect2(struct socket *so1, struct socket *so2)
+{
+ struct unpcb *unp, *unp2;
+ int error;
+
+ UNP_LINK_WLOCK();
+ unp = so1->so_pcb;
+ KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
+ UNP_PCB_LOCK(unp);
+ unp2 = so2->so_pcb;
+ KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
+ UNP_PCB_LOCK(unp2);
+ error = unp_connect2(so1, so2, PRU_CONNECT2);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
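+
+/*
+ * Illustrative userland sketch (not part of this file): uipc_connect2() is
+ * the path taken by socketpair(2) for PF_LOCAL sockets, which hands back two
+ * pre-connected descriptors without any bind/connect step:
+ *
+ *	int sv[2];
+ *
+ *	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv) == -1)
+ *		err(1, "socketpair");
+ *	write(sv[0], "ping", 4);
+ */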
+
+static void
+uipc_detach(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+ struct sockaddr_un *saved_unp_addr;
+ struct vnode *vp;
+ int freeunp, local_unp_rights;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_LIST_LOCK();
+ UNP_PCB_LOCK(unp);
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ --unp_count;
+ UNP_LIST_UNLOCK();
+
+ /*
+ * XXXRW: Should assert vp->v_socket == so.
+ */
+ if ((vp = unp->unp_vnode) != NULL) {
+ VOP_UNP_DETACH(vp);
+ unp->unp_vnode = NULL;
+ }
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+
+ /*
+ * We hold the linkage lock exclusively, so it's OK to acquire
+ * multiple pcb locks at a time.
+ */
+ while (!LIST_EMPTY(&unp->unp_refs)) {
+ struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
+
+ UNP_PCB_LOCK(ref);
+ unp_drop(ref, ECONNRESET);
+ UNP_PCB_UNLOCK(ref);
+ }
+ local_unp_rights = unp_rights;
+ UNP_LINK_WUNLOCK();
+ unp->unp_socket->so_pcb = NULL;
+ saved_unp_addr = unp->unp_addr;
+ unp->unp_addr = NULL;
+ unp->unp_refcount--;
+ freeunp = (unp->unp_refcount == 0);
+ if (saved_unp_addr != NULL)
+ free(saved_unp_addr, M_SONAME);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ } else
+ UNP_PCB_UNLOCK(unp);
+ if (vp)
+ vrele(vp);
+ if (local_unp_rights)
+ taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
+}
+
+static int
+uipc_disconnect(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (0);
+}
+
+static int
+uipc_listen(struct socket *so, int backlog, struct thread *td)
+{
+ struct unpcb *unp;
+ int error;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode == NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (EINVAL);
+ }
+
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0) {
+ cru2x(td->td_ucred, &unp->unp_peercred);
+ unp->unp_flags |= UNP_HAVEPCCACHED;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+ UNP_PCB_UNLOCK(unp);
+ return (error);
+}
+
+static int
+uipc_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp, *unp2;
+ const struct sockaddr *sa;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_LINK_RLOCK();
+ /*
+	 * XXX: It seems that this test always fails even when the connection
+	 * is established, so the else clause below is a workaround to return
+	 * a PF_LOCAL sockaddr.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ if (unp2->unp_addr != NULL)
+ sa = (struct sockaddr *) unp2->unp_addr;
+ else
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ }
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_rcvd(struct socket *so, int flags)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+ u_int mbcnt, sbcc;
+ u_long newhiwat;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
+
+ if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
+ panic("uipc_rcvd socktype %d", so->so_type);
+
+ /*
+	 * Adjust backpressure on the sender and wake up any threads waiting
+	 * to write.
+ *
+ * The unp lock is acquired to maintain the validity of the unp_conn
+ * pointer; no lock on unp2 is required as unp2->unp_socket will be
+ * static as long as we don't permit unp2 to disconnect from unp,
+ * which is prevented by the lock on unp. We cache values from
+ * so_rcv to avoid holding the so_rcv lock over the entire
+ * transaction on the remote so_snd.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ mbcnt = so->so_rcv.sb_mbcnt;
+ sbcc = so->so_rcv.sb_cc;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+ }
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_snd);
+ so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
+ newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
+ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ sowwakeup_locked(so2);
+ unp->unp_mbcnt = mbcnt;
+ unp->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+}
+
+static int
+uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+ u_int mbcnt_delta, sbcc;
+ u_int newhiwat;
+ int error = 0;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+
+ if (flags & PRUS_OOB) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+ if (control != NULL && (error = unp_internalize(&control, td)))
+ goto release;
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_LINK_WLOCK();
+ else
+ UNP_LINK_RLOCK();
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ {
+ const struct sockaddr *from;
+
+ unp2 = unp->unp_conn;
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ if (unp2 != NULL) {
+ error = EISCONN;
+ break;
+ }
+ error = unp_connect(so, nam, td);
+ if (error)
+ break;
+ unp2 = unp->unp_conn;
+ }
+
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ */
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ /* Lockless read. */
+ if (unp2->unp_flags & UNP_WANTCRED)
+ control = unp_addsockcred(td, control);
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_addr != NULL)
+ from = (struct sockaddr *)unp->unp_addr;
+ else
+ from = &sun_noname;
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_rcv);
+ if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
+ sorwakeup_locked(so2);
+ m = NULL;
+ control = NULL;
+ } else {
+ SOCKBUF_UNLOCK(&so2->so_rcv);
+ error = ENOBUFS;
+ }
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ break;
+ }
+
+ case SOCK_SEQPACKET:
+ case SOCK_STREAM:
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ error = unp_connect(so, nam, td);
+ if (error)
+ break; /* XXX */
+ } else {
+ error = ENOTCONN;
+ break;
+ }
+ }
+
+ /* Lockless read. */
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ *
+ * Locking here must be done carefully: the linkage lock
+ * prevents interconnections between unpcbs from changing, so
+ * we can traverse from unp to unp2 without acquiring unp's
+ * lock. Socket buffer locks follow unpcb locks, so we can
+		 * acquire both remote and local socket buffer locks.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ so2 = unp2->unp_socket;
+ UNP_PCB_LOCK(unp2);
+ SOCKBUF_LOCK(&so2->so_rcv);
+ if (unp2->unp_flags & UNP_WANTCRED) {
+ /*
+ * Credentials are passed only once on SOCK_STREAM
+ * and SOCK_SEQPACKET.
+ */
+ unp2->unp_flags &= ~UNP_WANTCRED;
+ control = unp_addsockcred(td, control);
+ }
+ /*
+ * Send to paired receive port, and then reduce send buffer
+ * hiwater marks to maintain backpressure. Wake up readers.
+ */
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ if (control != NULL) {
+ if (sbappendcontrol_locked(&so2->so_rcv, m,
+ control))
+ control = NULL;
+ } else
+ sbappend_locked(&so2->so_rcv, m);
+ break;
+
+ case SOCK_SEQPACKET: {
+ const struct sockaddr *from;
+
+ from = &sun_noname;
+ if (sbappendaddr_locked(&so2->so_rcv, from, m,
+ control))
+ control = NULL;
+ break;
+ }
+ }
+
+ /*
+ * XXXRW: While fine for SOCK_STREAM, this conflates maximum
+ * datagram size and back-pressure for SOCK_SEQPACKET, which
+ * can lead to undesired return of EMSGSIZE on send instead
+ * of more desirable blocking.
+ */
+ mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
+ unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+ sbcc = so2->so_rcv.sb_cc;
+ sorwakeup_locked(so2);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if ((int)so->so_snd.sb_hiwat >= (int)(sbcc - unp2->unp_cc))
+ newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
+ else
+ newhiwat = 0;
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ so->so_snd.sb_mbmax -= mbcnt_delta;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ unp2->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp2);
+ m = NULL;
+ break;
+
+ default:
+ panic("uipc_send unknown socktype");
+ }
+
+ /*
+ * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
+ */
+ if (flags & PRUS_EOF) {
+ UNP_PCB_LOCK(unp);
+ socantsendmore(so);
+ unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
+ }
+
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_LINK_WUNLOCK();
+ else
+ UNP_LINK_RUNLOCK();
+
+ if (control != NULL && error != 0)
+ unp_dispose(control);
+
+release:
+ if (control != NULL)
+ m_freem(control);
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+static int
+uipc_sense(struct socket *so, struct stat *sb)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
+
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ UNP_LINK_RLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) &&
+ unp2 != NULL) {
+ so2 = unp2->unp_socket;
+ sb->st_blksize += so2->so_rcv.sb_cc;
+ }
+ sb->st_dev = NODEV;
+ if (unp->unp_ino == 0)
+ unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
+ sb->st_ino = unp->unp_ino;
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_shutdown(struct socket *so)
+{
+ struct unpcb *unp;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ socantsendmore(so);
+ unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (0);
+}
+
+static int
+uipc_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp;
+ const struct sockaddr *sa;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_addr != NULL)
+ sa = (struct sockaddr *) unp->unp_addr;
+ else
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+}
+
+static struct pr_usrreqs uipc_usrreqs_dgram = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_dgram,
+ .pru_close = uipc_close,
+};
+
+static struct pr_usrreqs uipc_usrreqs_seqpacket = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_generic, /* XXX: or...? */
+ .pru_close = uipc_close,
+};
+
+static struct pr_usrreqs uipc_usrreqs_stream = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_generic,
+ .pru_close = uipc_close,
+};
+
+static int
+uipc_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct unpcb *unp;
+ struct xucred xu;
+ int error, optval;
+
+ if (sopt->sopt_level != 0)
+ return (EINVAL);
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
+ error = 0;
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case LOCAL_PEERCRED:
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_HAVEPC)
+ xu = unp->unp_peercred;
+ else {
+ if (so->so_type == SOCK_STREAM)
+ error = ENOTCONN;
+ else
+ error = EINVAL;
+ }
+ UNP_PCB_UNLOCK(unp);
+ if (error == 0)
+ error = sooptcopyout(sopt, &xu, sizeof(xu));
+ break;
+
+ case LOCAL_CREDS:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+
+ case LOCAL_CONNWAIT:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ break;
+
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ case LOCAL_CONNWAIT:
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ if (error)
+ break;
+
+#define OPTSET(bit) do { \
+ UNP_PCB_LOCK(unp); \
+ if (optval) \
+ unp->unp_flags |= bit; \
+ else \
+ unp->unp_flags &= ~bit; \
+ UNP_PCB_UNLOCK(unp); \
+} while (0)
+
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ OPTSET(UNP_WANTCRED);
+ break;
+
+ case LOCAL_CONNWAIT:
+ OPTSET(UNP_CONNWAIT);
+ break;
+
+ default:
+ break;
+ }
+ break;
+#undef OPTSET
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
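+
+/*
+ * Illustrative userland sketch (not part of this file): the LOCAL_PEERCRED
+ * and LOCAL_CREDS options handled above are driven with get/setsockopt(2) at
+ * level 0 (SOL_LOCAL); LOCAL_PEERCRED fills in a struct xucred describing the
+ * process on the other end of a connected stream or seqpacket socket ("s" is
+ * an assumed connected PF_LOCAL descriptor):
+ *
+ *	struct xucred xu;
+ *	socklen_t len = sizeof(xu);
+ *	int on = 1;
+ *
+ *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xu, &len) == 0)
+ *		printf("peer euid %u\n", (unsigned)xu.cr_uid);
+ *	setsockopt(s, 0, LOCAL_CREDS, &on, sizeof(on));
+ */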
+
+static int
+unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (unp_connectat(AT_FDCWD, so, nam, td));
+}
+
+static int
+unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vnode *vp;
+ struct socket *so2, *so3;
+ struct unpcb *unp, *unp2, *unp3;
+ struct nameidata nd;
+ char buf[SOCK_MAXADDRLEN];
+ struct sockaddr *sa;
+ cap_rights_t rights;
+ int error, len;
+
+ UNP_LINK_WLOCK_ASSERT();
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+ if (nam->sa_len > sizeof(struct sockaddr_un))
+ return (EINVAL);
+ len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
+ if (len <= 0)
+ return (EINVAL);
+ bcopy(soun->sun_path, buf, len);
+ buf[len] = 0;
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_CONNECTING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ UNP_LINK_WUNLOCK();
+ unp->unp_flags |= UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
+
+ sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
+ UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
+ error = namei(&nd);
+ if (error)
+ vp = NULL;
+ else
+ vp = nd.ni_vp;
+ ASSERT_VOP_LOCKED(vp, "unp_connect");
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error)
+ goto bad;
+
+ if (vp->v_type != VSOCK) {
+ error = ENOTSOCK;
+ goto bad;
+ }
+#ifdef MAC
+ error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
+ if (error)
+ goto bad;
+#endif
+ error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
+ if (error)
+ goto bad;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+ /*
+	 * Lock the linkage lock for two reasons: to make sure v_socket is
+	 * stable, and to protect the simultaneous locking of multiple pcbs.
+ */
+ UNP_LINK_WLOCK();
+ VOP_UNP_CONNECT(vp, &so2);
+ if (so2 == NULL) {
+ error = ECONNREFUSED;
+ goto bad2;
+ }
+ if (so->so_type != so2->so_type) {
+ error = EPROTOTYPE;
+ goto bad2;
+ }
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+ if (so2->so_options & SO_ACCEPTCONN) {
+ CURVNET_SET(so2->so_vnet);
+ so3 = sonewconn(so2, 0);
+ CURVNET_RESTORE();
+ } else
+ so3 = NULL;
+ if (so3 == NULL) {
+ error = ECONNREFUSED;
+ goto bad2;
+ }
+ unp = sotounpcb(so);
+ unp2 = sotounpcb(so2);
+ unp3 = sotounpcb(so3);
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
+ UNP_PCB_LOCK(unp3);
+ if (unp2->unp_addr != NULL) {
+ bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
+ unp3->unp_addr = (struct sockaddr_un *) sa;
+ sa = NULL;
+ }
+
+ /*
+ * The connector's (client's) credentials are copied from its
+ * process structure at the time of connect() (which is now).
+ */
+ cru2x(td->td_ucred, &unp3->unp_peercred);
+ unp3->unp_flags |= UNP_HAVEPC;
+
+ /*
+		 * The receiver's (server's) credentials are copied from the
+		 * unp_peercred member of the socket on which listen() was
+		 * called; uipc_listen() cached that process's credentials at
+		 * that time so we can use them now.
+ */
+ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
+ ("unp_connect: listener without cached peercred"));
+ memcpy(&unp->unp_peercred, &unp2->unp_peercred,
+ sizeof(unp->unp_peercred));
+ unp->unp_flags |= UNP_HAVEPC;
+ if (unp2->unp_flags & UNP_WANTCRED)
+ unp3->unp_flags |= UNP_WANTCRED;
+ UNP_PCB_UNLOCK(unp3);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+#ifdef MAC
+ mac_socketpeer_set_from_socket(so, so3);
+ mac_socketpeer_set_from_socket(so3, so);
+#endif
+
+ so2 = so3;
+ }
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
+ error = unp_connect2(so, so2, PRU_CONNECT);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+bad2:
+ UNP_LINK_WUNLOCK();
+bad:
+ if (vp != NULL)
+ vput(vp);
+ free(sa, M_SONAME);
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
+ return (error);
+}
+
+static int
+unp_connect2(struct socket *so, struct socket *so2, int req)
+{
+ struct unpcb *unp;
+ struct unpcb *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
+
+ if (so2->so_type != so->so_type)
+ return (EPROTOTYPE);
+ unp->unp_conn = unp2;
+
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+ soisconnected(so);
+ break;
+
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ unp2->unp_conn = unp;
+ if (req == PRU_CONNECT &&
+ ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
+ soisconnecting(so);
+ else
+ soisconnected(so);
+ soisconnected(so2);
+ break;
+
+ default:
+ panic("unp_connect2");
+ }
+ return (0);
+}
+
+static void
+unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
+{
+ struct socket *so;
+
+ KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
+
+ unp->unp_conn = NULL;
+ switch (unp->unp_socket->so_type) {
+ case SOCK_DGRAM:
+ LIST_REMOVE(unp, unp_reflink);
+ so = unp->unp_socket;
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ISCONNECTED;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ soisdisconnected(unp->unp_socket);
+ unp2->unp_conn = NULL;
+ soisdisconnected(unp2->unp_socket);
+ break;
+ }
+}
+
+/*
+ * unp_pcblist() walks the global list of struct unpcb's to generate a
+ * pointer list, bumping the refcount on each unpcb. It then copies them out
+ * sequentially, validating the generation number on each to see if it has
+ * been detached. All of this is necessary because copyout() may sleep on
+ * disk I/O.
+ */
+static int
+unp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ int freeunp;
+ struct unpcb *unp, **unp_list;
+ unp_gen_t gencnt;
+ struct xunpgen *xug;
+ struct unp_head *head;
+ struct xunpcb *xu;
+
+ switch ((intptr_t)arg1) {
+ case SOCK_STREAM:
+ head = &unp_shead;
+ break;
+
+ case SOCK_DGRAM:
+ head = &unp_dhead;
+ break;
+
+ case SOCK_SEQPACKET:
+ head = &unp_sphead;
+ break;
+
+ default:
+ panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
+ }
+
+ /*
+	 * Preparing the PCB list is too time-consuming and resource-intensive
+	 * to perform twice on every request, so when the caller is only
+	 * sizing its buffer, return a generous space estimate instead.
+ */
+ if (req->oldptr == NULL) {
+ n = unp_count;
+ req->oldidx = 2 * (sizeof *xug)
+ + (n + n/8) * sizeof(struct xunpcb);
+ return (0);
+ }
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
+ UNP_LIST_LOCK();
+ gencnt = unp_gencnt;
+ n = unp_count;
+ UNP_LIST_UNLOCK();
+
+ xug->xug_len = sizeof *xug;
+ xug->xug_count = n;
+ xug->xug_gen = gencnt;
+ xug->xug_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ if (error) {
+ free(xug, M_TEMP);
+ return (error);
+ }
+
+ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
+
+ UNP_LIST_LOCK();
+ for (unp = LIST_FIRST(head), i = 0; unp && i < n;
+ unp = LIST_NEXT(unp, unp_link)) {
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_gencnt <= gencnt) {
+ if (cr_cansee(req->td->td_ucred,
+ unp->unp_socket->so_cred)) {
+ UNP_PCB_UNLOCK(unp);
+ continue;
+ }
+ unp_list[i++] = unp;
+ unp->unp_refcount++;
+ }
+ UNP_PCB_UNLOCK(unp);
+ }
+ UNP_LIST_UNLOCK();
+ n = i; /* In case we lost some during malloc. */
+
+ error = 0;
+ xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
+ for (i = 0; i < n; i++) {
+ unp = unp_list[i];
+ UNP_PCB_LOCK(unp);
+ unp->unp_refcount--;
+ if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
+ xu->xu_len = sizeof *xu;
+ xu->xu_unpp = unp;
+ /*
+ * XXX - need more locking here to protect against
+ * connect/disconnect races for SMP.
+ */
+ if (unp->unp_addr != NULL)
+ bcopy(unp->unp_addr, &xu->xu_addr,
+ unp->unp_addr->sun_len);
+ if (unp->unp_conn != NULL &&
+ unp->unp_conn->unp_addr != NULL)
+ bcopy(unp->unp_conn->unp_addr,
+ &xu->xu_caddr,
+ unp->unp_conn->unp_addr->sun_len);
+ bcopy(unp, &xu->xu_unp, sizeof *unp);
+ sotoxsocket(unp->unp_socket, &xu->xu_socket);
+ UNP_PCB_UNLOCK(unp);
+ error = SYSCTL_OUT(req, xu, sizeof *xu);
+ } else {
+ freeunp = (unp->unp_refcount == 0);
+ UNP_PCB_UNLOCK(unp);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ }
+ }
+ }
+ free(xu, M_TEMP);
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state. If the
+ * generation differs from what we told her before, she knows
+ * that something happened while we were processing this
+ * request, and it might be necessary to retry.
+ */
+ xug->xug_gen = unp_gencnt;
+ xug->xug_sogen = so_gencnt;
+ xug->xug_count = unp_count;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ }
+ free(unp_list, M_TEMP);
+ free(xug, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local datagram sockets");
+SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local stream sockets");
+SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
+ CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
+ "List of active local seqpacket sockets");
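+
+/*
+ * Illustrative userland sketch (not part of this file): these pcblist sysctls
+ * are consumed with sysctl(3), e.g. by netstat(1).  The returned buffer is a
+ * struct xunpgen header, a sequence of struct xunpcb records, and a trailing
+ * struct xunpgen:
+ *
+ *	struct xunpgen xug;
+ *	size_t len = 0;
+ *	char *buf;
+ *
+ *	if (sysctlbyname("net.local.stream.pcblist", NULL, &len, NULL, 0) == 0 &&
+ *	    (buf = malloc(len)) != NULL &&
+ *	    sysctlbyname("net.local.stream.pcblist", buf, &len, NULL, 0) == 0) {
+ *		memcpy(&xug, buf, sizeof(xug));
+ *		printf("%u local stream pcbs\n", (unsigned)xug.xug_count);
+ *	}
+ */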
+
+static void
+unp_shutdown(struct unpcb *unp)
+{
+ struct unpcb *unp2;
+ struct socket *so;
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+
+ unp2 = unp->unp_conn;
+ if ((unp->unp_socket->so_type == SOCK_STREAM ||
+ (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
+ so = unp2->unp_socket;
+ if (so != NULL)
+ socantrcvmore(so);
+ }
+}
+
+static void
+unp_drop(struct unpcb *unp, int errno)
+{
+ struct socket *so = unp->unp_socket;
+ struct unpcb *unp2;
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+
+ so->so_error = errno;
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL)
+ return;
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+}
+
+static void
+unp_freerights(struct filedescent **fdep, int fdcount)
+{
+ struct file *fp;
+ int i;
+
+ KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
+
+ for (i = 0; i < fdcount; i++) {
+ fp = fdep[i]->fde_file;
+ filecaps_free(&fdep[i]->fde_caps);
+ unp_discard(fp);
+ }
+ free(fdep[0], M_FILECAPS);
+}
+
+static int
+unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
+{
+ struct thread *td = curthread; /* XXX */
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ int i;
+ int *fdp;
+ struct filedesc *fdesc = td->td_proc->p_fd;
+ struct filedescent *fde, **fdep;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, newfds;
+ u_int newlen;
+
+ UNP_LINK_UNLOCK_ASSERT();
+
+ error = 0;
+ if (controlp != NULL) /* controlp == NULL => free control messages */
+ *controlp = NULL;
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
+ error = EINVAL;
+ break;
+ }
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+ if (cm->cmsg_level == SOL_SOCKET
+ && cm->cmsg_type == SCM_RIGHTS) {
+ newfds = datalen / sizeof(*fdep);
+ if (newfds == 0)
+ goto next;
+ fdep = data;
+
+			/* If we're not outputting the descriptors, free them. */
+ if (error || controlp == NULL) {
+ unp_freerights(fdep, newfds);
+ goto next;
+ }
+ FILEDESC_XLOCK(fdesc);
+
+ /*
+			 * Now change each pointer to a file in the global
+			 * table into an integer index into the receiver's fd
+			 * table, with each new fd table entry set up to point
+			 * to the file being transferred.
+ */
+ newlen = newfds * sizeof(int);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_XUNLOCK(fdesc);
+ error = E2BIG;
+ unp_freerights(fdep, newfds);
+ goto next;
+ }
+
+ fdp = (int *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ if (fdallocn(td, 0, fdp, newfds) != 0) {
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ error = EMSGSIZE;
+ unp_freerights(fdep, newfds);
+ m_freem(*controlp);
+ *controlp = NULL;
+ goto next;
+ }
+ for (i = 0; i < newfds; i++, fdp++) {
+ fde = &fdesc->fd_ofiles[*fdp];
+ fde->fde_file = fdep[i]->fde_file;
+ filecaps_move(&fdep[i]->fde_caps,
+ &fde->fde_caps);
+ if ((flags & MSG_CMSG_CLOEXEC) != 0)
+ fde->fde_flags |= UF_EXCLOSE;
+ unp_externalize_fp(fde->fde_file);
+ }
+ FILEDESC_XUNLOCK(fdesc);
+ free(fdep[0], M_FILECAPS);
+ } else {
+ /* We can just copy anything else across. */
+ if (error || controlp == NULL)
+ goto next;
+ *controlp = sbcreatecontrol(NULL, datalen,
+ cm->cmsg_type, cm->cmsg_level);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto next;
+ }
+ bcopy(data,
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
+ datalen);
+ }
+ controlp = &(*controlp)->m_next;
+
+next:
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+ m_freem(control);
+ return (error);
+}
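+
+/*
+ * Illustrative userland sketch (not part of this file): unp_externalize()
+ * above is the kernel half of receiving descriptors; from user space the same
+ * SCM_RIGHTS message is picked up with recvmsg(2), where MSG_CMSG_CLOEXEC
+ * maps to the UF_EXCLOSE handling above ("s" is an assumed connected PF_LOCAL
+ * socket):
+ *
+ *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
+ *	char c;
+ *	struct iovec iov = { &c, 1 };
+ *	struct msghdr msg;
+ *	struct cmsghdr *cmsg;
+ *	int newfd = -1;
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	msg.msg_iov = &iov;
+ *	msg.msg_iovlen = 1;
+ *	msg.msg_control = cm.buf;
+ *	msg.msg_controllen = sizeof(cm.buf);
+ *	if (recvmsg(s, &msg, MSG_CMSG_CLOEXEC) > 0 &&
+ *	    (cmsg = CMSG_FIRSTHDR(&msg)) != NULL &&
+ *	    cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
+ *		memcpy(&newfd, CMSG_DATA(cmsg), sizeof(int));
+ */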
+
+static void
+unp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(unp_zone, maxsockets);
+}
+
+static void
+unp_init(void)
+{
+
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VNET(curvnet))
+ return;
+#endif
+ unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (unp_zone == NULL)
+ panic("unp_init");
+ uma_zone_set_max(unp_zone, maxsockets);
+ uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
+ EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+ LIST_INIT(&unp_dhead);
+ LIST_INIT(&unp_shead);
+ LIST_INIT(&unp_sphead);
+ SLIST_INIT(&unp_defers);
+ TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
+ TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
+ UNP_LINK_LOCK_INIT();
+ UNP_LIST_LOCK_INIT();
+ UNP_DEFERRED_LOCK_INIT();
+}
+
+static int
+unp_internalize(struct mbuf **controlp, struct thread *td)
+{
+ struct mbuf *control = *controlp;
+ struct proc *p = td->td_proc;
+ struct filedesc *fdesc = p->p_fd;
+ struct bintime *bt;
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ struct cmsgcred *cmcred;
+ struct filedescent *fde, **fdep, *fdev;
+ struct file *fp;
+ struct timeval *tv;
+ int i, fd, *fdp;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, oldfds;
+ u_int newlen;
+
+ UNP_LINK_UNLOCK_ASSERT();
+
+ error = 0;
+ *controlp = NULL;
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
+ || cm->cmsg_len > clen) {
+ error = EINVAL;
+ goto out;
+ }
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+
+ switch (cm->cmsg_type) {
+ /*
+ * Fill in credential information.
+ */
+ case SCM_CREDS:
+ *controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
+ SCM_CREDS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ cmcred = (struct cmsgcred *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ cmcred->cmcred_pid = p->p_pid;
+ cmcred->cmcred_uid = td->td_ucred->cr_ruid;
+ cmcred->cmcred_gid = td->td_ucred->cr_rgid;
+ cmcred->cmcred_euid = td->td_ucred->cr_uid;
+ cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
+ CMGROUP_MAX);
+ for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[i] =
+ td->td_ucred->cr_groups[i];
+ break;
+
+ case SCM_RIGHTS:
+ oldfds = datalen / sizeof (int);
+ if (oldfds == 0)
+ break;
+ /*
+ * Check that all the FDs passed in refer to legal
+ * files. If not, reject the entire operation.
+ */
+ fdp = data;
+ FILEDESC_SLOCK(fdesc);
+ for (i = 0; i < oldfds; i++) {
+ fd = *fdp++;
+ if (fget_locked(fdesc, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = EBADF;
+ goto out;
+ }
+ fp = fdesc->fd_ofiles[fd].fde_file;
+ if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
+ }
+
+ /*
+ * Now replace the integer FDs with pointers to the
+ * file structure and capability rights.
+ */
+ newlen = oldfds * sizeof(fdep[0]);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = E2BIG;
+ goto out;
+ }
+ fdp = data;
+ fdep = (struct filedescent **)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
+ M_WAITOK);
+ for (i = 0; i < oldfds; i++, fdev++, fdp++) {
+ fde = &fdesc->fd_ofiles[*fdp];
+ fdep[i] = fdev;
+ fdep[i]->fde_file = fde->fde_file;
+ filecaps_copy(&fde->fde_caps,
+ &fdep[i]->fde_caps);
+ unp_internalize_fp(fdep[i]->fde_file);
+ }
+ FILEDESC_SUNLOCK(fdesc);
+ break;
+
+ case SCM_TIMESTAMP:
+ *controlp = sbcreatecontrol(NULL, sizeof(*tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ tv = (struct timeval *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ microtime(tv);
+ break;
+
+ case SCM_BINTIME:
+ *controlp = sbcreatecontrol(NULL, sizeof(*bt),
+ SCM_BINTIME, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ bt = (struct bintime *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ bintime(bt);
+ break;
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ controlp = &(*controlp)->m_next;
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+out:
+ m_freem(control);
+ return (error);
+}
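+
+/*
+ * Illustrative userland sketch (not part of this file): the SCM_RIGHTS case
+ * above internalizes descriptors that user space sends with sendmsg(2); a
+ * minimal sender looks like this ("s" and "fd_to_pass" are assumed to be an
+ * existing PF_LOCAL socket and the descriptor being transferred):
+ *
+ *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
+ *	char c = 0;
+ *	struct iovec iov = { &c, 1 };
+ *	struct msghdr msg;
+ *	struct cmsghdr *cmsg;
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	msg.msg_iov = &iov;
+ *	msg.msg_iovlen = 1;
+ *	msg.msg_control = cm.buf;
+ *	msg.msg_controllen = sizeof(cm.buf);
+ *	cmsg = CMSG_FIRSTHDR(&msg);
+ *	cmsg->cmsg_level = SOL_SOCKET;
+ *	cmsg->cmsg_type = SCM_RIGHTS;
+ *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
+ *	if (sendmsg(s, &msg, 0) == -1)
+ *		err(1, "sendmsg");
+ */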
+
+static struct mbuf *
+unp_addsockcred(struct thread *td, struct mbuf *control)
+{
+ struct mbuf *m, *n, *n_prev;
+ struct sockcred *sc;
+ const struct cmsghdr *cm;
+ int ngroups;
+ int i;
+
+ ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
+ m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
+ if (m == NULL)
+ return (control);
+
+ sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
+ sc->sc_uid = td->td_ucred->cr_ruid;
+ sc->sc_euid = td->td_ucred->cr_uid;
+ sc->sc_gid = td->td_ucred->cr_rgid;
+ sc->sc_egid = td->td_ucred->cr_gid;
+ sc->sc_ngroups = ngroups;
+ for (i = 0; i < sc->sc_ngroups; i++)
+ sc->sc_groups[i] = td->td_ucred->cr_groups[i];
+
+ /*
+	 * Unlink SCM_CREDS control messages (struct cmsgcred), since the
+	 * just-created SCM_CREDS control message (struct sockcred) has a
+	 * different format.
+ */
+ if (control != NULL)
+ for (n = control, n_prev = NULL; n != NULL;) {
+ cm = mtod(n, struct cmsghdr *);
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_CREDS) {
+ if (n_prev == NULL)
+ control = n->m_next;
+ else
+ n_prev->m_next = n->m_next;
+ n = m_free(n);
+ } else {
+ n_prev = n;
+ n = n->m_next;
+ }
+ }
+
+ /* Prepend it to the head. */
+ m->m_next = control;
+ return (m);
+}
+
+static struct unpcb *
+fptounp(struct file *fp)
+{
+ struct socket *so;
+
+ if (fp->f_type != DTYPE_SOCKET)
+ return (NULL);
+ if ((so = fp->f_data) == NULL)
+ return (NULL);
+ if (so->so_proto->pr_domain != &localdomain)
+ return (NULL);
+ return sotounpcb(so);
+}
+
+static void
+unp_discard(struct file *fp)
+{
+ struct unp_defer *dr;
+
+ if (unp_externalize_fp(fp)) {
+ dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
+ dr->ud_fp = fp;
+ UNP_DEFERRED_LOCK();
+ SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
+ UNP_DEFERRED_UNLOCK();
+ atomic_add_int(&unp_defers_count, 1);
+ taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
+ } else
+ (void) closef(fp, (struct thread *)NULL);
+}
+
+static void
+unp_process_defers(void *arg __unused, int pending)
+{
+ struct unp_defer *dr;
+ SLIST_HEAD(, unp_defer) drl;
+ int count;
+
+ SLIST_INIT(&drl);
+ for (;;) {
+ UNP_DEFERRED_LOCK();
+ if (SLIST_FIRST(&unp_defers) == NULL) {
+ UNP_DEFERRED_UNLOCK();
+ break;
+ }
+ SLIST_SWAP(&unp_defers, &drl, unp_defer);
+ UNP_DEFERRED_UNLOCK();
+ count = 0;
+ while ((dr = SLIST_FIRST(&drl)) != NULL) {
+ SLIST_REMOVE_HEAD(&drl, ud_link);
+ closef(dr->ud_fp, NULL);
+ free(dr, M_TEMP);
+ count++;
+ }
+ atomic_add_int(&unp_defers_count, -count);
+ }
+}
+
+static void
+unp_internalize_fp(struct file *fp)
+{
+ struct unpcb *unp;
+
+ UNP_LINK_WLOCK();
+ if ((unp = fptounp(fp)) != NULL) {
+ unp->unp_file = fp;
+ unp->unp_msgcount++;
+ }
+ fhold(fp);
+ unp_rights++;
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+unp_externalize_fp(struct file *fp)
+{
+ struct unpcb *unp;
+ int ret;
+
+ UNP_LINK_WLOCK();
+ if ((unp = fptounp(fp)) != NULL) {
+ unp->unp_msgcount--;
+ ret = 1;
+ } else
+ ret = 0;
+ unp_rights--;
+ UNP_LINK_WUNLOCK();
+ return (ret);
+}
+
+/*
+ * unp_marked indicates whether additional sockets were newly marked reachable
+ * during the current pass through unp_gc() (so another pass is required), and
+ * unp_unreachable counts sockets that appear to be dead.  Both are used only
+ * by the GC task and do not require explicit synchronization.
+ */
+static int unp_marked;
+static int unp_unreachable;
+
+static void
+unp_accessable(struct filedescent **fdep, int fdcount)
+{
+ struct unpcb *unp;
+ struct file *fp;
+ int i;
+
+ for (i = 0; i < fdcount; i++) {
+ fp = fdep[i]->fde_file;
+ if ((unp = fptounp(fp)) == NULL)
+ continue;
+ if (unp->unp_gcflag & UNPGC_REF)
+ continue;
+ unp->unp_gcflag &= ~UNPGC_DEAD;
+ unp->unp_gcflag |= UNPGC_REF;
+ unp_marked++;
+ }
+}
+
+static void
+unp_gc_process(struct unpcb *unp)
+{
+ struct socket *soa;
+ struct socket *so;
+ struct file *fp;
+
+ /* Already processed. */
+ if (unp->unp_gcflag & UNPGC_SCANNED)
+ return;
+ fp = unp->unp_file;
+
+ /*
+ * Check for a socket potentially in a cycle. It must be in a
+ * queue as indicated by msgcount, and this must equal the file
+ * reference count. Note that when msgcount is 0 the file is NULL.
+ */
+ if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp &&
+ unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) {
+ unp->unp_gcflag |= UNPGC_DEAD;
+ unp_unreachable++;
+ return;
+ }
+
+ /*
+	 * Mark all sockets this socket references via SCM_RIGHTS messages in
+	 * its receive buffer.
+ */
+ so = unp->unp_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ unp_scan(so->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ /*
+ * Mark all sockets in our accept queue.
+ */
+ ACCEPT_LOCK();
+ TAILQ_FOREACH(soa, &so->so_comp, so_list) {
+ SOCKBUF_LOCK(&soa->so_rcv);
+ unp_scan(soa->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&soa->so_rcv);
+ }
+ ACCEPT_UNLOCK();
+ unp->unp_gcflag |= UNPGC_SCANNED;
+}
+
+static int unp_recycled;
+SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
+ "Number of unreachable sockets claimed by the garbage collector.");
+
+static int unp_taskcount;
+SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
+ "Number of times the garbage collector has run.");
+
+static void
+unp_gc(__unused void *arg, int pending)
+{
+ struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
+ NULL };
+ struct unp_head **head;
+ struct file *f, **unref;
+ struct unpcb *unp;
+ int i, total;
+
+ unp_taskcount++;
+ UNP_LIST_LOCK();
+ /*
+ * First clear all gc flags from previous runs.
+ */
+ for (head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ unp->unp_gcflag = 0;
+
+ /*
+ * Scan marking all reachable sockets with UNPGC_REF. Once a socket
+ * is reachable all of the sockets it references are reachable.
+ * Stop the scan once we do a complete loop without discovering
+ * a new reachable socket.
+ */
+ do {
+ unp_unreachable = 0;
+ unp_marked = 0;
+ for (head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ unp_gc_process(unp);
+ } while (unp_marked);
+ UNP_LIST_UNLOCK();
+ if (unp_unreachable == 0)
+ return;
+
+ /*
+	 * Allocate space for a local array of file pointers for the dead
+	 * sockets.
+ */
+ unref = malloc(unp_unreachable * sizeof(struct file *),
+ M_TEMP, M_WAITOK);
+
+ /*
+ * Iterate looking for sockets which have been specifically marked
+	 * as unreachable and store them locally.
+ */
+ UNP_LINK_RLOCK();
+ UNP_LIST_LOCK();
+ for (total = 0, head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ if ((unp->unp_gcflag & UNPGC_DEAD) != 0) {
+ f = unp->unp_file;
+ if (unp->unp_msgcount == 0 || f == NULL ||
+ f->f_count != unp->unp_msgcount)
+ continue;
+ unref[total++] = f;
+ fhold(f);
+ KASSERT(total <= unp_unreachable,
+ ("unp_gc: incorrect unreachable count."));
+ }
+ UNP_LIST_UNLOCK();
+ UNP_LINK_RUNLOCK();
+
+ /*
+	 * Now flush all sockets, freeing rights.  This will free the
+	 * struct files associated with these sockets but leave each socket
+	 * with one remaining reference.
+ */
+ for (i = 0; i < total; i++) {
+ struct socket *so;
+
+ so = unref[i]->f_data;
+ CURVNET_SET(so->so_vnet);
+ sorflush(so);
+ CURVNET_RESTORE();
+ }
+
+ /*
+ * And finally release the sockets so they can be reclaimed.
+ */
+ for (i = 0; i < total; i++)
+ fdrop(unref[i], NULL);
+ unp_recycled += total;
+ free(unref, M_TEMP);
+}
+
+static void
+unp_dispose(struct mbuf *m)
+{
+
+ if (m)
+ unp_scan(m, unp_freerights);
+}
+
+static void
+unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
+{
+ struct mbuf *m;
+ struct cmsghdr *cm;
+ void *data;
+ socklen_t clen, datalen;
+
+ while (m0 != NULL) {
+ for (m = m0; m; m = m->m_next) {
+ if (m->m_type != MT_CONTROL)
+ continue;
+
+ cm = mtod(m, struct cmsghdr *);
+ clen = m->m_len;
+
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen)
+ break;
+
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len
+ - (caddr_t)data;
+
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_RIGHTS) {
+ (*op)(data, datalen /
+ sizeof(struct filedescent *));
+ }
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+ }
+ m0 = m0->m_act;
+ }
+}
+
+/*
+ * A helper function called by VFS before socket-type vnode reclamation.
+ * For an active vnode it clears the unp_vnode pointer and drops the vnode's
+ * use count.
+ */
+void
+vfs_unp_reclaim(struct vnode *vp)
+{
+ struct socket *so;
+ struct unpcb *unp;
+ int active;
+
+ ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
+ KASSERT(vp->v_type == VSOCK,
+ ("vfs_unp_reclaim: vp->v_type != VSOCK"));
+
+ active = 0;
+ UNP_LINK_WLOCK();
+ VOP_UNP_CONNECT(vp, &so);
+ if (so == NULL)
+ goto done;
+ unp = sotounpcb(so);
+ if (unp == NULL)
+ goto done;
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode == vp) {
+ VOP_UNP_DETACH(vp);
+ unp->unp_vnode = NULL;
+ active = 1;
+ }
+ UNP_PCB_UNLOCK(unp);
+done:
+ UNP_LINK_WUNLOCK();
+ if (active)
+ vunref(vp);
+}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_unpflags(int unp_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (unp_flags & UNP_HAVEPC) {
+ db_printf("%sUNP_HAVEPC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_HAVEPCCACHED) {
+ db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_WANTCRED) {
+ db_printf("%sUNP_WANTCRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNWAIT) {
+ db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNECTING) {
+ db_printf("%sUNP_CONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_BINDING) {
+ db_printf("%sUNP_BINDING", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_xucred(int indent, struct xucred *xu)
+{
+ int comma, i;
+
+ db_print_indent(indent);
+ db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n",
+ xu->cr_version, xu->cr_uid, xu->cr_ngroups);
+ db_print_indent(indent);
+ db_printf("cr_groups: ");
+ comma = 0;
+ for (i = 0; i < xu->cr_ngroups; i++) {
+ db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
+ comma = 1;
+ }
+ db_printf("\n");
+}
+
+static void
+db_print_unprefs(int indent, struct unp_head *uh)
+{
+ struct unpcb *unp;
+ int counter;
+
+ counter = 0;
+ LIST_FOREACH(unp, uh, unp_reflink) {
+ if (counter % 4 == 0)
+ db_print_indent(indent);
+ db_printf("%p ", unp);
+ if (counter % 4 == 3)
+ db_printf("\n");
+ counter++;
+ }
+ if (counter != 0 && counter % 4 != 0)
+ db_printf("\n");
+}
+
+DB_SHOW_COMMAND(unpcb, db_show_unpcb)
+{
+ struct unpcb *unp;
+
+ if (!have_addr) {
+ db_printf("usage: show unpcb <addr>\n");
+ return;
+ }
+ unp = (struct unpcb *)addr;
+
+ db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket,
+ unp->unp_vnode);
+
+ db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino,
+ unp->unp_conn);
+
+ db_printf("unp_refs:\n");
+ db_print_unprefs(2, &unp->unp_refs);
+
+ /* XXXRW: Would be nice to print the full address, if any. */
+ db_printf("unp_addr: %p\n", unp->unp_addr);
+
+ db_printf("unp_cc: %d unp_mbcnt: %d unp_gencnt: %llu\n",
+ unp->unp_cc, unp->unp_mbcnt,
+ (unsigned long long)unp->unp_gencnt);
+
+ db_printf("unp_flags: %x (", unp->unp_flags);
+ db_print_unpflags(unp->unp_flags);
+ db_printf(")\n");
+
+ db_printf("unp_peercred:\n");
+ db_print_xucred(2, &unp->unp_peercred);
+
+ db_printf("unp_refcount: %u\n", unp->unp_refcount);
+}
+#endif
diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c
new file mode 100644
index 0000000..362792b
--- /dev/null
+++ b/sys/kern/vfs_acl.c
@@ -0,0 +1,562 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL system calls and other functions common across different ACL types.
+ * Type-specific routines go into subr_acl_<type>.c.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/acl.h>
+
+#include <security/mac/mac_framework.h>
+
+CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES);
+
+MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists");
+
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+int
+acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest)
+{
+ int i;
+
+ if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+ dest->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+int
+acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest)
+{
+ int i;
+
+ if (source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+/*
+ * At one time, "struct acl" was extended in order to add support for NFSv4
+ * ACLs.  Instead of creating compatibility versions of all the ACL-related
+ * syscalls, the existing ones were left intact.  What the code calling these
+ * syscalls (libc) expects can be determined from the "type" argument: if it
+ * is either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously
+ * were known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it is a "struct
+ * oldacl"; anything else means the new "struct acl".  In the latter case the
+ * routines below just copyin/copyout the contents.  In the former case they
+ * copyin a "struct oldacl" and convert it to the new format.
+ */
+static int
+acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type)
+{
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = copyin(user_acl, &old, sizeof(old));
+ if (error != 0)
+ break;
+ acl_copy_oldacl_into_acl(&old, kernel_acl);
+ break;
+
+ default:
+ error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl));
+ if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES)
+ return (EINVAL);
+ }
+
+ return (error);
+}
+
+static int
+acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type)
+{
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = acl_copy_acl_into_oldacl(kernel_acl, &old);
+ if (error != 0)
+ break;
+
+ error = copyout(&old, user_acl, sizeof(old));
+ break;
+
+ default:
+ if (fuword32((char *)user_acl +
+ offsetof(struct acl, acl_maxcnt)) != ACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl));
+ }
+
+ return (error);
+}
+
+/*
+ * Convert an "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new"
+ * counterpart.  This is required for an old (pre-NFSv4 ACL) libc to work
+ * with a new kernel.  Fixing 'type' for old binaries with a new libc is
+ * done in lib/libc/posix1e/acl_support.c:_acl_type_unold().
+ */
+static int
+acl_type_unold(int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ return (ACL_TYPE_ACCESS);
+
+ case ACL_TYPE_DEFAULT_OLD:
+ return (ACL_TYPE_DEFAULT);
+
+ default:
+ return (type);
+ }
+}
+
+/*
+ * These calls wrap the real vnode operations, and are called by the syscall
+ * code once the syscall has converted the path or file descriptor to a vnode
+ * (unlocked). The aclp pointer is assumed still to point to userland, so
+ * this should not be consumed within the kernel except by syscall code.
+ * Other code should directly invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ struct mount *mp;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto out;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl);
+ if (error != 0)
+ goto out_unlock;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+#ifdef MAC
+out_unlock:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK | M_ZERO);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_getacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ error = acl_copyout(inkernelacl, aclp, type);
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it.
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * The syscalls below convert the path or file descriptor to a vnode and call
+ * the appropriate vacl_* routine.  They do not need to lock, as the vacl_
+ * code acquires and releases any locks required.
+ */
+
+/*
+ * Given a file path, get an ACL for it.
+ */
+int
+sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, get an ACL for it; don't follow links.
+ */
+int
+sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it.
+ */
+int
+sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it; don't follow links.
+ */
+int
+sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it.
+ */
+int
+sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_GET), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it.
+ */
+int
+sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_SET), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it; don't follow links.
+ */
+int
+sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, delete an ACL from it.
+ */
+int
+sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_DELETE), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, fp->f_vnode, uap->type);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it; don't follow links.
+ */
+int
+sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_CHECK), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+struct acl *
+acl_alloc(int flags)
+{
+ struct acl *aclp;
+
+ aclp = malloc(sizeof(*aclp), M_ACL, flags);
+ if (aclp == NULL)
+ return (NULL);
+
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ return (aclp);
+}
+
+void
+acl_free(struct acl *aclp)
+{
+
+ free(aclp, M_ACL);
+}
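+
+/*
+ * Illustrative sketch only, not part of this file: userland normally reaches
+ * the __acl_* syscalls above through the POSIX.1e library wrappers in libc
+ * (acl_get_file(3), acl_to_text(3), acl_free(3)).  A minimal sketch of
+ * dumping a file's access ACL, assuming those standard interfaces:
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/acl.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(int argc, char **argv)
+ *	{
+ *		acl_t acl;
+ *		char *text;
+ *
+ *		if (argc != 2)
+ *			errx(1, "usage: showacl path");
+ *		// acl_get_file(3) ends up in sys___acl_get_file() above.
+ *		acl = acl_get_file(argv[1], ACL_TYPE_ACCESS);
+ *		if (acl == NULL)
+ *			err(1, "acl_get_file");
+ *		text = acl_to_text(acl, NULL);	// textual form of the ACL
+ *		printf("%s", text);
+ *		acl_free(text);			// both objects are acl_free(3)-able
+ *		acl_free(acl);
+ *		return (0);
+ *	}
+ */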
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
new file mode 100644
index 0000000..7f9f881
--- /dev/null
+++ b/sys/kern/vfs_aio.c
@@ -0,0 +1,3069 @@
+/*-
+ * Copyright (c) 1997 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. John S. Dyson's name may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * DISCLAIMER: This code isn't warranted to do anything useful. Anything
+ * bad that happens because of using this software isn't the responsibility
+ * of the author. This software is distributed AS-IS.
+ */
+
+/*
+ * This file contains support for the POSIX 1003.1B AIO/LIO facility.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/kthread.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/unistd.h>
+#include <sys/posix4.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/protosw.h>
+#include <sys/rwlock.h>
+#include <sys/sema.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/mount.h>
+
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/uma.h>
+#include <sys/aio.h>
+
+#include "opt_vfs_aio.h"
+
+/*
+ * Counter for allocating reference ids to new jobs. Wrapped to 1 on
+ * overflow. (XXX will be removed soon.)
+ */
+static u_long jobrefid;
+
+/*
+ * Counter for aio_fsync.
+ */
+static uint64_t jobseqno;
+
+#define JOBST_NULL 0
+#define JOBST_JOBQSOCK 1
+#define JOBST_JOBQGLOBAL 2
+#define JOBST_JOBRUNNING 3
+#define JOBST_JOBFINISHED 4
+#define JOBST_JOBQBUF 5
+#define JOBST_JOBQSYNC 6
+
+#ifndef MAX_AIO_PER_PROC
+#define MAX_AIO_PER_PROC 32
+#endif
+
+#ifndef MAX_AIO_QUEUE_PER_PROC
+#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef MAX_AIO_PROCS
+#define MAX_AIO_PROCS 32
+#endif
+
+#ifndef MAX_AIO_QUEUE
+#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef TARGET_AIO_PROCS
+#define TARGET_AIO_PROCS 4
+#endif
+
+#ifndef MAX_BUF_AIO
+#define MAX_BUF_AIO 16
+#endif
+
+#ifndef AIOD_TIMEOUT_DEFAULT
+#define AIOD_TIMEOUT_DEFAULT (10 * hz)
+#endif
+
+#ifndef AIOD_LIFETIME_DEFAULT
+#define AIOD_LIFETIME_DEFAULT (30 * hz)
+#endif
+
+FEATURE(aio, "Asynchronous I/O");
+
+static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
+
+static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
+
+static int max_aio_procs = MAX_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
+ CTLFLAG_RW, &max_aio_procs, 0,
+ "Maximum number of kernel threads to use for handling async IO ");
+
+static int num_aio_procs = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
+ CTLFLAG_RD, &num_aio_procs, 0,
+ "Number of presently active kernel threads for async IO");
+
+/*
+ * The code will adjust the actual number of AIO processes towards this
+ * number when it gets a chance.
+ */
+static int target_aio_procs = TARGET_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
+ 0, "Preferred number of ready kernel threads for async IO");
+
+static int max_queue_count = MAX_AIO_QUEUE;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
+ "Maximum number of aio requests to queue, globally");
+
+static int num_queue_count = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
+ "Number of queued aio requests");
+
+static int num_buf_aio = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
+ "Number of aio requests presently handled by the buf subsystem");
+
+/* Number of async I/O threads in the process of being started */
+/* XXX This should be local to aio_aqueue() */
+static int num_aio_resv_start = 0;
+
+static int aiod_timeout;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
+ "Timeout value for synchronous aio operations");
+
+static int aiod_lifetime;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
+ "Maximum lifetime for idle aiod");
+
+static int unloadable = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of aio (not recommended)");
+
+
+static int max_aio_per_proc = MAX_AIO_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
+ 0, "Maximum active aio requests per process (stored in the process)");
+
+static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
+ &max_aio_queue_per_proc, 0,
+ "Maximum queued aio requests per process (stored in the process)");
+
+static int max_buf_aio = MAX_BUF_AIO;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
+ "Maximum buf aio requests per process (stored in the process)");
+
+typedef struct oaiocb {
+ int aio_fildes; /* File descriptor */
+ off_t aio_offset; /* File offset for I/O */
+ volatile void *aio_buf; /* I/O buffer in process space */
+ size_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private _aiocb_private;
+} oaiocb_t;
+
+/*
+ * Below is a key of the locks used to protect each member of struct
+ * aiocblist, aioliojob and kaioinfo, and any backends.
+ *
+ * * - need not be protected
+ * a - locked by kaioinfo lock
+ * b - locked by backend lock, the backend lock can be null in some cases,
+ * for example, BIO belongs to this type, in this case, proc lock is
+ * reused.
+ * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
+ */
+
+/*
+ * Currently there are only two backends: BIO and generic file I/O.
+ * Socket I/O is served by the generic file I/O backend; this is not a
+ * good idea, since disk file I/O and any other type done without the
+ * O_NONBLOCK flag can block the daemon threads.  If there is no thread
+ * to serve socket I/O, that I/O may be delayed too long or starved.
+ * We should create threads dedicated to sockets that do non-blocking
+ * I/O, and the same goes for pipes and fifos; for these I/O types we
+ * really need a non-blocking interface, and fiddling with O_NONBLOCK
+ * in the file structure is not safe because there is a race between
+ * userland and the aio daemons.
+ */
+
+struct aiocblist {
+ TAILQ_ENTRY(aiocblist) list; /* (b) internal list for backend */
+ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
+ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
+ int jobflags; /* (a) job flags */
+ int jobstate; /* (b) job state */
+ int inputcharge; /* (*) input blocks */
+ int outputcharge; /* (*) output blocks */
+ struct buf *bp; /* (*) private to BIO backend,
+ * buffer pointer
+ */
+ struct proc *userproc; /* (*) user process */
+ struct ucred *cred; /* (*) active credential when created */
+ struct file *fd_file; /* (*) pointer to file structure */
+ struct aioliojob *lio; /* (*) optional lio job */
+ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
+ struct knlist klist; /* (a) list of knotes */
+ struct aiocb uaiocb; /* (*) kernel I/O control block */
+ ksiginfo_t ksi; /* (a) realtime signal info */
+ struct task biotask; /* (*) private to BIO backend */
+ uint64_t seqno; /* (*) job number */
+ int pending; /* (a) number of pending I/O, aio_fsync only */
+};
+
+/* jobflags */
+#define AIOCBLIST_DONE 0x01
+#define AIOCBLIST_BUFDONE 0x02
+#define AIOCBLIST_RUNDOWN 0x04
+#define AIOCBLIST_CHECKSYNC 0x08
+
+/*
+ * AIO process info
+ */
+#define AIOP_FREE 0x1 /* proc on free queue */
+
+struct aiothreadlist {
+ int aiothreadflags; /* (c) AIO proc flags */
+ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
+ struct thread *aiothread; /* (*) the AIO thread */
+};
+
+/*
+ * data-structure for lio signal management
+ */
+struct aioliojob {
+ int lioj_flags; /* (a) listio flags */
+ int lioj_count; /* (a) listio reference count */
+ int lioj_finished_count; /* (a) number of finished jobs */
+ struct sigevent lioj_signal; /* (a) signal on all I/O done */
+ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
+ struct knlist klist; /* (a) list of knotes */
+ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
+};
+
+#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
+#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
+
+/*
+ * per process aio data structure
+ */
+struct kaioinfo {
+ struct mtx kaio_mtx; /* the lock to protect this struct */
+ int kaio_flags; /* (a) per process kaio flags */
+ int kaio_maxactive_count; /* (*) maximum number of AIOs */
+ int kaio_active_count; /* (c) number of currently used AIOs */
+ int kaio_qallowed_count; /* (*) maximum size of AIO queue */
+ int kaio_count; /* (a) size of AIO queue */
+ int kaio_ballowed_count; /* (*) maximum number of buffers */
+ int kaio_buffer_count; /* (a) number of physio buffers */
+ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
+ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
+ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
+ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
+ * NOT USED YET.
+ */
+ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
+ struct task kaio_task; /* (*) task to kick aio threads */
+};
+
+#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
+#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
+#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
+#define AIO_MTX(ki) (&(ki)->kaio_mtx)
+
+#define KAIO_RUNDOWN 0x1 /* process is being run down */
+#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
+
+/*
+ * Operations used to interact with userland aio control blocks.
+ * Different ABIs provide their own operations.
+ */
+struct aiocb_ops {
+ int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
+ long (*fetch_status)(struct aiocb *ujob);
+ long (*fetch_error)(struct aiocb *ujob);
+ int (*store_status)(struct aiocb *ujob, long status);
+ int (*store_error)(struct aiocb *ujob, long error);
+ int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
+ int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
+};
+
+static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
+static struct sema aio_newproc_sem;
+static struct mtx aio_job_mtx;
+static struct mtx aio_sock_mtx;
+static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
+static struct unrhdr *aiod_unr;
+
+void aio_init_aioinfo(struct proc *p);
+static int aio_onceonly(void);
+static int aio_free_entry(struct aiocblist *aiocbe);
+static void aio_process_rw(struct aiocblist *aiocbe);
+static void aio_process_sync(struct aiocblist *aiocbe);
+static void aio_process_mlock(struct aiocblist *aiocbe);
+static int aio_newproc(int *);
+int aio_aqueue(struct thread *td, struct aiocb *job,
+ struct aioliojob *lio, int type, struct aiocb_ops *ops);
+static void aio_physwakeup(struct buf *bp);
+static void aio_proc_rundown(void *arg, struct proc *p);
+static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
+static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
+static void biohelper(void *, int);
+static void aio_daemon(void *param);
+static void aio_swake_cb(struct socket *, struct sockbuf *);
+static int aio_unload(void);
+static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
+#define DONE_BUF 1
+#define DONE_QUEUE 2
+static int aio_kick(struct proc *userp);
+static void aio_kick_nowait(struct proc *userp);
+static void aio_kick_helper(void *context, int pending);
+static int filt_aioattach(struct knote *kn);
+static void filt_aiodetach(struct knote *kn);
+static int filt_aio(struct knote *kn, long hint);
+static int filt_lioattach(struct knote *kn);
+static void filt_liodetach(struct knote *kn);
+static int filt_lio(struct knote *kn, long hint);
+
+/*
+ * Zones for:
+ * kaio Per process async io info
+ * aiop async io thread data
+ * aiocb async io jobs
+ * aiol list io job pointer - internal to aio_suspend XXX
+ * aiolio list io jobs
+ */
+static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
+
+/* kqueue filters for aio */
+static struct filterops aio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_aioattach,
+ .f_detach = filt_aiodetach,
+ .f_event = filt_aio,
+};
+static struct filterops lio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_lioattach,
+ .f_detach = filt_liodetach,
+ .f_event = filt_lio
+};
+
+static eventhandler_tag exit_tag, exec_tag;
+
+TASKQUEUE_DEFINE_THREAD(aiod_bio);
+
+/*
+ * Main operations function for use as a kernel module.
+ */
+static int
+aio_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ aio_onceonly();
+ break;
+ case MOD_UNLOAD:
+ error = aio_unload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t aio_mod = {
+ "aio",
+ &aio_modload,
+ NULL
+};
+
+static struct syscall_helper_data aio_syscalls[] = {
+ SYSCALL_INIT_HELPER(aio_cancel),
+ SYSCALL_INIT_HELPER(aio_error),
+ SYSCALL_INIT_HELPER(aio_fsync),
+ SYSCALL_INIT_HELPER(aio_mlock),
+ SYSCALL_INIT_HELPER(aio_read),
+ SYSCALL_INIT_HELPER(aio_return),
+ SYSCALL_INIT_HELPER(aio_suspend),
+ SYSCALL_INIT_HELPER(aio_waitcomplete),
+ SYSCALL_INIT_HELPER(aio_write),
+ SYSCALL_INIT_HELPER(lio_listio),
+ SYSCALL_INIT_HELPER(oaio_read),
+ SYSCALL_INIT_HELPER(oaio_write),
+ SYSCALL_INIT_HELPER(olio_listio),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data aio32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_aio_return),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_error),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_read),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_write),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
+ SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
+ SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
+ SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
+ SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+DECLARE_MODULE(aio, aio_mod,
+ SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(aio, 1);
+
+/*
+ * Startup initialization
+ */
+static int
+aio_onceonly(void)
+{
+ int error;
+
+ /* XXX: should probably just use so->callback */
+ aio_swake = &aio_swake_cb;
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
+ EVENTHANDLER_PRI_ANY);
+ exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
+ EVENTHANDLER_PRI_ANY);
+ kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+ kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
+ TAILQ_INIT(&aio_freeproc);
+ sema_init(&aio_newproc_sem, 0, "aio_new_proc");
+ mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
+ mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
+ TAILQ_INIT(&aio_jobs);
+ aiod_unr = new_unrhdr(1, INT_MAX, NULL);
+ kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiod_timeout = AIOD_TIMEOUT_DEFAULT;
+ aiod_lifetime = AIOD_LIFETIME_DEFAULT;
+ jobrefid = 1;
+ async_io_version = _POSIX_VERSION;
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
+
+ error = syscall_helper_register(aio_syscalls);
+ if (error)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(aio32_syscalls);
+ if (error)
+ return (error);
+#endif
+ return (0);
+}
+
+/*
+ * Callback for unload of AIO when used as a module.
+ */
+static int
+aio_unload(void)
+{
+ int error;
+
+ /*
+ * XXX: no unloads by default, it's too dangerous.
+ * Perhaps we could do it if we locked out callers and then
+ * did an aio_proc_rundown() on each process.
+ *
+ * jhb: aio_proc_rundown() needs to run on curproc though,
+ * so I don't think that would fly.
+ */
+ if (!unloadable)
+ return (EOPNOTSUPP);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(aio32_syscalls);
+#endif
+ syscall_helper_unregister(aio_syscalls);
+
+ error = kqueue_del_filteropts(EVFILT_AIO);
+ if (error)
+ return (error);
+ error = kqueue_del_filteropts(EVFILT_LIO);
+ if (error)
+ return (error);
+ async_io_version = 0;
+ aio_swake = NULL;
+ taskqueue_free(taskqueue_aiod_bio);
+ delete_unrhdr(aiod_unr);
+ uma_zdestroy(kaio_zone);
+ uma_zdestroy(aiop_zone);
+ uma_zdestroy(aiocb_zone);
+ uma_zdestroy(aiol_zone);
+ uma_zdestroy(aiolio_zone);
+ EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+ EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
+ mtx_destroy(&aio_job_mtx);
+ mtx_destroy(&aio_sock_mtx);
+ sema_destroy(&aio_newproc_sem);
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+ return (0);
+}
+
+/*
+ * Init the per-process aioinfo structure. The aioinfo limits are set
+ * per-process for user limit (resource) management.
+ */
+void
+aio_init_aioinfo(struct proc *p)
+{
+ struct kaioinfo *ki;
+
+ ki = uma_zalloc(kaio_zone, M_WAITOK);
+ mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
+ ki->kaio_flags = 0;
+ ki->kaio_maxactive_count = max_aio_per_proc;
+ ki->kaio_active_count = 0;
+ ki->kaio_qallowed_count = max_aio_queue_per_proc;
+ ki->kaio_count = 0;
+ ki->kaio_ballowed_count = max_buf_aio;
+ ki->kaio_buffer_count = 0;
+ TAILQ_INIT(&ki->kaio_all);
+ TAILQ_INIT(&ki->kaio_done);
+ TAILQ_INIT(&ki->kaio_jobqueue);
+ TAILQ_INIT(&ki->kaio_bufqueue);
+ TAILQ_INIT(&ki->kaio_liojoblist);
+ TAILQ_INIT(&ki->kaio_sockqueue);
+ TAILQ_INIT(&ki->kaio_syncqueue);
+ TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
+ PROC_LOCK(p);
+ if (p->p_aioinfo == NULL) {
+ p->p_aioinfo = ki;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_UNLOCK(p);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ }
+
+ while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
+ aio_newproc(NULL);
+}
+
+static int
+aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+ struct thread *td;
+ int error;
+
+ error = sigev_findtd(p, sigev, &td);
+ if (error)
+ return (error);
+ if (!KSI_ONQ(ksi)) {
+ ksiginfo_set_sigev(ksi, sigev);
+ ksi->ksi_code = SI_ASYNCIO;
+ ksi->ksi_flags |= KSI_EXT | KSI_INS;
+ tdsendsignal(p, td, ksi->ksi_signo, ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Free a job entry. Wait for completion if it is currently active, but don't
+ * delay forever. If we delay, we return a flag that says that we have to
+ * restart the queue scan.
+ */
+static int
+aio_free_entry(struct aiocblist *aiocbe)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct proc *p;
+
+ p = aiocbe->userproc;
+ MPASS(curproc == p);
+ ki = p->p_aioinfo;
+ MPASS(ki != NULL);
+
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
+
+ atomic_subtract_int(&num_queue_count, 1);
+
+ ki->kaio_count--;
+ MPASS(ki->kaio_count >= 0);
+
+ TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
+ TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_count--;
+ lj->lioj_finished_count--;
+
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ /* lio is going away, we need to destroy any knotes */
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ }
+ }
+
+ /* aiocbe is going away, we need to destroy any knotes */
+ knlist_delete(&aiocbe->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&aiocbe->ksi);
+ PROC_UNLOCK(p);
+
+ MPASS(aiocbe->bp == NULL);
+ aiocbe->jobstate = JOBST_NULL;
+ AIO_UNLOCK(ki);
+
+ /*
+ * The thread argument here is used to find the owning process
+ * and is also passed to fo_close() which may pass it to various
+ * places such as devsw close() routines. Because of that, we
+ * need a thread pointer from the process owning the job that is
+ * persistent and won't disappear out from under us or move to
+ * another process.
+ *
+ * Currently, all the callers of this function call it to remove
+ * an aiocblist from the current process' job list either via a
+ * syscall or due to the current process calling exit() or
+ * execve(). Thus, we know that p == curproc. We also know that
+ * curthread can't exit since we are curthread.
+ *
+ * Therefore, we use curthread as the thread to pass to
+ * knlist_delete(). This does mean that it is possible for the
+ * thread pointer at close time to differ from the thread pointer
+ * at open time, but this is already true of file descriptors in
+ * a multithreaded process.
+ */
+ if (aiocbe->fd_file)
+ fdrop(aiocbe->fd_file, curthread);
+ crfree(aiocbe->cred);
+ uma_zfree(aiocb_zone, aiocbe);
+ AIO_LOCK(ki);
+
+ return (0);
+}
+
+static void
+aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ aio_proc_rundown(arg, p);
+}
+
+/*
+ * Rundown the jobs for a given process.
+ */
+static void
+aio_proc_rundown(void *arg, struct proc *p)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct aiocblist *cbe, *cbn;
+ struct file *fp;
+ struct socket *so;
+ int remove;
+
+ KASSERT(curthread->td_proc == p,
+ ("%s: called on non-curproc", __func__));
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return;
+
+ AIO_LOCK(ki);
+ ki->kaio_flags |= KAIO_RUNDOWN;
+
+restart:
+
+ /*
+ * Try to cancel all pending requests. This code simulates
+ * aio_cancel on all pending I/O requests.
+ */
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+ remove = 0;
+ mtx_lock(&aio_job_mtx);
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ fp = cbe->fd_file;
+ MPASS(fp->f_type == DTYPE_SOCKET);
+ so = fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
+ cbe->jobstate = JOBST_JOBFINISHED;
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
+ }
+ }
+
+ /* Wait for all running I/O to be finished */
+ if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
+ TAILQ_FIRST(&ki->kaio_jobqueue)) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
+ goto restart;
+ }
+
+ /* Free all completed I/O requests. */
+ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
+ aio_free_entry(cbe);
+
+ while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ } else {
+ panic("LIO job not cleaned up: C:%d, FC:%d\n",
+ lj->lioj_count, lj->lioj_finished_count);
+ }
+ }
+ AIO_UNLOCK(ki);
+ taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ p->p_aioinfo = NULL;
+}
+
+/*
+ * Select a job to run (called by an AIO daemon).
+ */
+static struct aiocblist *
+aio_selectjob(struct aiothreadlist *aiop)
+{
+ struct aiocblist *aiocbe;
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+
+ if (ki->kaio_active_count < ki->kaio_maxactive_count) {
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ /* Account for currently active jobs. */
+ ki->kaio_active_count++;
+ aiocbe->jobstate = JOBST_JOBRUNNING;
+ break;
+ }
+ }
+ return (aiocbe);
+}
+
+/*
+ * Move all data to a permanent storage device.  This code
+ * simulates the fsync syscall.
+ */
+static int
+aio_fsync_vnode(struct thread *td, struct vnode *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ return (error);
+}
+
+/*
+ * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
+ * does the I/O request for the non-physio version of the operations. The
+ * normal vn operations are used, and this code should work in all instances
+ * for every type of file, including pipes, sockets, fifos, and regular files.
+ *
+ * XXX I don't think it works well for sockets, pipes, and fifos.
+ */
+static void
+aio_process_rw(struct aiocblist *aiocbe)
+{
+ struct ucred *td_savedcred;
+ struct thread *td;
+ struct aiocb *cb;
+ struct file *fp;
+ struct socket *so;
+ struct uio auio;
+ struct iovec aiov;
+ int cnt;
+ int error;
+ int oublock_st, oublock_end;
+ int inblock_st, inblock_end;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
+ aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ td = curthread;
+ td_savedcred = td->td_ucred;
+ td->td_ucred = aiocbe->cred;
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
+ aiov.iov_len = cb->aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = cb->aio_offset;
+ auio.uio_resid = cb->aio_nbytes;
+ cnt = cb->aio_nbytes;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+
+ inblock_st = td->td_ru.ru_inblock;
+ oublock_st = td->td_ru.ru_oublock;
+ /*
+ * aio_aqueue() acquires a reference to the file that is
+ * released in aio_free_entry().
+ */
+ if (cb->aio_lio_opcode == LIO_READ) {
+ auio.uio_rw = UIO_READ;
+ if (auio.uio_resid == 0)
+ error = 0;
+ else
+ error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ } else {
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ auio.uio_rw = UIO_WRITE;
+ error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ }
+ inblock_end = td->td_ru.ru_inblock;
+ oublock_end = td->td_ru.ru_oublock;
+
+ aiocbe->inputcharge = inblock_end - inblock_st;
+ aiocbe->outputcharge = oublock_end - oublock_st;
+
+ if ((error) && (auio.uio_resid != cnt)) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
+ int sigpipe = 1;
+ if (fp->f_type == DTYPE_SOCKET) {
+ so = fp->f_data;
+ if (so->so_options & SO_NOSIGPIPE)
+ sigpipe = 0;
+ }
+ if (sigpipe) {
+ PROC_LOCK(aiocbe->userproc);
+ kern_psignal(aiocbe->userproc, SIGPIPE);
+ PROC_UNLOCK(aiocbe->userproc);
+ }
+ }
+ }
+
+ cnt -= auio.uio_resid;
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = cnt;
+ td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_sync(struct aiocblist *aiocbe)
+{
+ struct thread *td = curthread;
+ struct ucred *td_savedcred = td->td_ucred;
+ struct aiocb *cb = &aiocbe->uaiocb;
+ struct file *fp = aiocbe->fd_file;
+ int error = 0;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ td->td_ucred = aiocbe->cred;
+ if (fp->f_vnode != NULL)
+ error = aio_fsync_vnode(td, fp->f_vnode);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+ td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_mlock(struct aiocblist *aiocbe)
+{
+ struct aiocb *cb = &aiocbe->uaiocb;
+ int error;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ error = vm_mlock(aiocbe->userproc, aiocbe->cred,
+ __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+}
+
+static void
+aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
+{
+ struct aioliojob *lj;
+ struct kaioinfo *ki;
+ struct aiocblist *scb, *scbn;
+ int lj_done;
+
+ ki = userp->p_aioinfo;
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ lj = aiocbe->lio;
+ lj_done = 0;
+ if (lj) {
+ lj->lioj_finished_count++;
+ if (lj->lioj_count == lj->lioj_finished_count)
+ lj_done = 1;
+ }
+ if (type == DONE_QUEUE) {
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ } else {
+ aiocbe->jobflags |= AIOCBLIST_BUFDONE;
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBFINISHED;
+
+ if (ki->kaio_flags & KAIO_RUNDOWN)
+ goto notification_done;
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
+ aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
+
+ KNOTE_LOCKED(&aiocbe->klist, 1);
+
+ if (lj_done) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+
+notification_done:
+ if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
+ TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
+ if (aiocbe->fd_file == scb->fd_file &&
+ aiocbe->seqno < scb->seqno) {
+ if (--scb->pending == 0) {
+ mtx_lock(&aio_job_mtx);
+ scb->jobstate = JOBST_JOBQGLOBAL;
+ TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
+ TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
+ aio_kick_nowait(userp);
+ mtx_unlock(&aio_job_mtx);
+ }
+ }
+ }
+ }
+ if (ki->kaio_flags & KAIO_WAKEUP) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(&userp->p_aioinfo);
+ }
+}
+
+/*
+ * The AIO daemon: most of the actual work is done in aio_process_*(),
+ * but the setup (and address space management) is done in this routine.
+ */
+static void
+aio_daemon(void *_id)
+{
+ struct aiocblist *aiocbe;
+ struct aiothreadlist *aiop;
+ struct kaioinfo *ki;
+ struct proc *curcp, *mycp, *userp;
+ struct vmspace *myvm, *tmpvm;
+ struct thread *td = curthread;
+ int id = (intptr_t)_id;
+
+ /*
+ * Local copies of curproc (mycp) and vmspace (myvm).
+ */
+ mycp = td->td_proc;
+ myvm = mycp->p_vmspace;
+
+ KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
+
+ /*
+ * Allocate and ready the aio control info. There is one aiop structure
+ * per daemon.
+ */
+ aiop = uma_zalloc(aiop_zone, M_WAITOK);
+ aiop->aiothread = td;
+ aiop->aiothreadflags = 0;
+
+ /* The daemon resides in its own pgrp. */
+ sys_setsid(td, NULL);
+
+ /*
+ * Wakeup parent process. (Parent sleeps to keep from blasting away
+ * and creating too many daemons.)
+ */
+ sema_post(&aio_newproc_sem);
+
+ mtx_lock(&aio_job_mtx);
+ for (;;) {
+ /*
+ * curcp is the current daemon process context.
+ * userp is the current user process context.
+ */
+ curcp = mycp;
+
+ /*
+ * Take the daemon off the free queue.
+ */
+ if (aiop->aiothreadflags & AIOP_FREE) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ }
+
+ /*
+ * Check for jobs.
+ */
+ while ((aiocbe = aio_selectjob(aiop)) != NULL) {
+ mtx_unlock(&aio_job_mtx);
+ userp = aiocbe->userproc;
+
+ /*
+ * Connect to process address space for user program.
+ */
+ if (userp != curcp) {
+ /*
+ * Save the current address space that we are
+ * connected to.
+ */
+ tmpvm = mycp->p_vmspace;
+
+ /*
+ * Point to the new user address space and take a
+ * reference to it.
+ */
+ mycp->p_vmspace = userp->p_vmspace;
+ atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
+
+ /* Activate the new mapping. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+
+ /*
+ * If the old address space wasn't the daemon's
+ * own address space, then we need to remove the
+ * daemon's reference from the other process
+ * that it was acting on behalf of.
+ */
+ if (tmpvm != myvm) {
+ vmspace_free(tmpvm);
+ }
+ curcp = userp;
+ }
+
+ ki = userp->p_aioinfo;
+
+ /* Do the I/O function. */
+ switch(aiocbe->uaiocb.aio_lio_opcode) {
+ case LIO_READ:
+ case LIO_WRITE:
+ aio_process_rw(aiocbe);
+ break;
+ case LIO_SYNC:
+ aio_process_sync(aiocbe);
+ break;
+ case LIO_MLOCK:
+ aio_process_mlock(aiocbe);
+ break;
+ }
+
+ mtx_lock(&aio_job_mtx);
+ /* Decrement the active job count. */
+ ki->kaio_active_count--;
+ mtx_unlock(&aio_job_mtx);
+
+ AIO_LOCK(ki);
+ TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
+ aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
+ AIO_UNLOCK(ki);
+
+ mtx_lock(&aio_job_mtx);
+ }
+
+ /*
+ * Disconnect from user address space.
+ */
+ if (curcp != mycp) {
+
+ mtx_unlock(&aio_job_mtx);
+
+ /* Get the user address space to disconnect from. */
+ tmpvm = mycp->p_vmspace;
+
+ /* Get original address space for daemon. */
+ mycp->p_vmspace = myvm;
+
+ /* Activate the daemon's address space. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+#ifdef DIAGNOSTIC
+ if (tmpvm == myvm) {
+ printf("AIOD: vmspace problem -- %d\n",
+ mycp->p_pid);
+ }
+#endif
+ /* Remove our vmspace reference. */
+ vmspace_free(tmpvm);
+
+ curcp = mycp;
+
+ mtx_lock(&aio_job_mtx);
+ /*
+ * We have to restart to avoid a race; we only sleep if
+ * no job can be selected, which should mean
+ * curcp == mycp.
+ */
+ continue;
+ }
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags |= AIOP_FREE;
+
+ /*
+ * If the daemon is inactive for a long time, allow it to exit,
+ * thereby freeing resources.
+ */
+ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
+ aiod_lifetime)) {
+ if (TAILQ_EMPTY(&aio_jobs)) {
+ if ((aiop->aiothreadflags & AIOP_FREE) &&
+ (num_aio_procs > target_aio_procs)) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ num_aio_procs--;
+ mtx_unlock(&aio_job_mtx);
+ uma_zfree(aiop_zone, aiop);
+ free_unr(aiod_unr, id);
+#ifdef DIAGNOSTIC
+ if (mycp->p_vmspace->vm_refcnt <= 1) {
+ printf("AIOD: bad vm refcnt for"
+ " exiting daemon: %d\n",
+ mycp->p_vmspace->vm_refcnt);
+ }
+#endif
+ kproc_exit(0);
+ }
+ }
+ }
+ }
+ mtx_unlock(&aio_job_mtx);
+ panic("shouldn't be here\n");
+}
+
+/*
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
+ * AIO daemon modifies its environment itself.
+ */
+static int
+aio_newproc(int *start)
+{
+ int error;
+ struct proc *p;
+ int id;
+
+ id = alloc_unr(aiod_unr);
+ error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
+ RFNOWAIT, 0, "aiod%d", id);
+ if (error == 0) {
+ /*
+ * Wait until daemon is started.
+ */
+ sema_wait(&aio_newproc_sem);
+ mtx_lock(&aio_job_mtx);
+ num_aio_procs++;
+ if (start != NULL)
+ (*start)--;
+ mtx_unlock(&aio_job_mtx);
+ } else {
+ free_unr(aiod_unr, id);
+ }
+ return (error);
+}
+
+/*
+ * Try the high-performance, low-overhead physio method for eligible
+ * VCHR devices. This method doesn't use an aio helper thread, and
+ * thus has very low overhead.
+ *
+ * Assumes that the caller, aio_aqueue(), has incremented the file
+ * structure's reference count, preventing its deallocation for the
+ * duration of this call.
+ */
+static int
+aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
+{
+ struct aiocb *cb;
+ struct file *fp;
+ struct buf *bp;
+ struct vnode *vp;
+ struct cdevsw *csw;
+ struct cdev *dev;
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ int error, ref;
+
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
+ return (-1);
+
+ vp = fp->f_vnode;
+
+ /*
+ * If it's not a disk, we don't want to return a positive error;
+ * that would keep the aio code from falling through to try the
+ * thread-based path when the target is a regular file.
+ */
+ if (!vn_isdisk(vp, &error)) {
+ if (error == ENOTBLK)
+ return (-1);
+ else
+ return (error);
+ }
+
+ if (vp->v_bufobj.bo_bsize == 0)
+ return (-1);
+
+ if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+ return (-1);
+
+ if (cb->aio_nbytes >
+ MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+ return (-1);
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+ return (-1);
+
+ ref = 0;
+ csw = devvn_refthread(vp, &dev, &ref);
+ if (csw == NULL)
+ return (ENXIO);
+ if (cb->aio_nbytes > dev->si_iosize_max) {
+ error = -1;
+ goto unref;
+ }
+
+ /* Create and build a buffer header for a transfer. */
+ bp = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(bp);
+
+ AIO_LOCK(ki);
+ ki->kaio_count++;
+ ki->kaio_buffer_count++;
+ lj = aiocbe->lio;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
+
+ /*
+ * Get a copy of the kva from the physical buffer.
+ */
+ error = 0;
+
+ bp->b_bcount = cb->aio_nbytes;
+ bp->b_bufsize = cb->aio_nbytes;
+ bp->b_iodone = aio_physwakeup;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->b_offset = cb->aio_offset;
+ bp->b_iooffset = cb->aio_offset;
+ bp->b_blkno = btodb(cb->aio_offset);
+ bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+
+ /*
+ * Bring buffer into kernel space.
+ */
+ if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+
+ AIO_LOCK(ki);
+ aiocbe->bp = bp;
+ bp->b_caller1 = (void *)aiocbe;
+ TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ aiocbe->jobstate = JOBST_JOBQBUF;
+ cb->_aiocb_private.status = cb->aio_nbytes;
+ AIO_UNLOCK(ki);
+
+ atomic_add_int(&num_queue_count, 1);
+ atomic_add_int(&num_buf_aio, 1);
+
+ bp->b_error = 0;
+
+ TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
+
+ /* Perform transfer. */
+ dev_strategy_csw(dev, csw, bp);
+ dev_relthread(dev, ref);
+ return (0);
+
+doerror:
+ AIO_LOCK(ki);
+ ki->kaio_count--;
+ ki->kaio_buffer_count--;
+ if (lj)
+ lj->lioj_count--;
+ aiocbe->bp = NULL;
+ AIO_UNLOCK(ki);
+ relpbuf(bp, NULL);
+unref:
+ dev_relthread(dev, ref);
+ return (error);
+}
+
+/*
+ * Wake up aio requests that may be serviceable now.
+ */
+static void
+aio_swake_cb(struct socket *so, struct sockbuf *sb)
+{
+ struct aiocblist *cb, *cbn;
+ int opcode;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ if (sb == &so->so_snd)
+ opcode = LIO_WRITE;
+ else
+ opcode = LIO_READ;
+
+ sb->sb_flags &= ~SB_AIO;
+ mtx_lock(&aio_job_mtx);
+ TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
+ if (opcode == cb->uaiocb.aio_lio_opcode) {
+ if (cb->jobstate != JOBST_JOBQSOCK)
+ panic("invalid queue value");
+ /* XXX
+ * We don't have an actual socket backend yet,
+ * so we simply move the requests to the generic
+ * file I/O backend.
+ */
+ TAILQ_REMOVE(&so->so_aiojobq, cb, list);
+ TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
+ aio_kick_nowait(cb->userproc);
+ }
+ }
+ mtx_unlock(&aio_job_mtx);
+}
+
+static int
+convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ nsig->sigev_notify = osig->sigev_notify;
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb *ojob;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, kjob, sizeof(struct oaiocb));
+ if (error)
+ return (error);
+ ojob = (struct oaiocb *)kjob;
+ return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
+}
+
+static int
+aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+
+ return (copyin(ujob, kjob, sizeof(struct aiocb)));
+}
+
+static long
+aiocb_fetch_status(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.status));
+}
+
+static long
+aiocb_fetch_error(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.error));
+}
+
+static int
+aiocb_store_status(struct aiocb *ujob, long status)
+{
+
+ return (suword(&ujob->_aiocb_private.status, status));
+}
+
+static int
+aiocb_store_error(struct aiocb *ujob, long error)
+{
+
+ return (suword(&ujob->_aiocb_private.error, error));
+}
+
+static int
+aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+
+ return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb_ops = {
+ .copyin = aiocb_copyin,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+
+static struct aiocb_ops aiocb_ops_osigevent = {
+ .copyin = aiocb_copyin_old_sigevent,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+
+/*
+ * Queue a new AIO request.  The choice between the threaded and the
+ * direct physio (VCHR) techniques is made in this code.
+ */
+int
+aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
+ int type, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ cap_rights_t rights;
+ struct file *fp;
+ struct socket *so;
+ struct aiocblist *aiocbe, *cb;
+ struct kaioinfo *ki;
+ struct kevent kev;
+ struct sockbuf *sb;
+ int opcode;
+ int error;
+ int fd, kqfd;
+ int jid;
+ u_short evflags;
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ ops->store_status(job, -1);
+ ops->store_error(job, 0);
+ ops->store_kernelinfo(job, -1);
+
+ if (num_queue_count >= max_queue_count ||
+ ki->kaio_count >= ki->kaio_qallowed_count) {
+ ops->store_error(job, EAGAIN);
+ return (EAGAIN);
+ }
+
+ aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
+ knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
+
+ error = ops->copyin(job, &aiocbe->uaiocb);
+ if (error) {
+ ops->store_error(job, error);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (error);
+ }
+
+ /* XXX: aio_nbytes is later cast to signed types. */
+ if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
+ ops->store_error(job, EINVAL);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ ksiginfo_init(&aiocbe->ksi);
+
+ /* Save userspace address of the job info. */
+ aiocbe->uuaiocb = job;
+
+ /* Get the opcode. */
+ if (type != LIO_NOP)
+ aiocbe->uaiocb.aio_lio_opcode = type;
+ opcode = aiocbe->uaiocb.aio_lio_opcode;
+
+ /*
+ * Validate the opcode and fetch the file object for the specified
+ * file descriptor.
+ *
+ * XXXRW: Moved the opcode validation up here so that we don't
+ * retrieve a file descriptor without knowing what the capability
+ * should be.
+ */
+ fd = aiocbe->uaiocb.aio_fildes;
+ switch (opcode) {
+ case LIO_WRITE:
+ error = fget_write(td, fd,
+ cap_rights_init(&rights, CAP_PWRITE), &fp);
+ break;
+ case LIO_READ:
+ error = fget_read(td, fd,
+ cap_rights_init(&rights, CAP_PREAD), &fp);
+ break;
+ case LIO_SYNC:
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
+ break;
+ case LIO_MLOCK:
+ fp = NULL;
+ break;
+ case LIO_NOP:
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error) {
+ uma_zfree(aiocb_zone, aiocbe);
+ ops->store_error(job, error);
+ return (error);
+ }
+
+ if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ aiocbe->fd_file = fp;
+
+ mtx_lock(&aio_job_mtx);
+ jid = jobrefid++;
+ aiocbe->seqno = jobseqno++;
+ mtx_unlock(&aio_job_mtx);
+ error = ops->store_kernelinfo(job, jid);
+ if (error) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
+
+ if (opcode == LIO_NOP) {
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (0);
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
+ goto no_kqueue;
+ evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
+ if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
+ kev.ident = (uintptr_t)aiocbe->uuaiocb;
+ kev.filter = EVFILT_AIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
+ kev.data = (intptr_t)aiocbe;
+ kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
+ error = kqfd_register(kqfd, &kev, td, 1);
+aqueue_fail:
+ if (error) {
+ if (fp)
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ ops->store_error(job, error);
+ goto done;
+ }
+no_kqueue:
+
+ ops->store_error(job, EINPROGRESS);
+ aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
+ aiocbe->userproc = p;
+ aiocbe->cred = crhold(td->td_ucred);
+ aiocbe->jobflags = 0;
+ aiocbe->lio = lj;
+
+ if (opcode == LIO_SYNC)
+ goto queueit;
+
+ if (fp && fp->f_type == DTYPE_SOCKET) {
+ /*
+ * Alternate queueing for socket ops: Reach down into the
+ * descriptor to get the socket data. Then check to see if the
+ * socket is ready to be read or written (based on the requested
+ * operation).
+ *
+ * If it is not ready for I/O, then queue the aiocbe on the
+ * socket, and set the flags so we get a call when sbnotify()
+ * happens.
+ *
+ * Note that if the opcode is neither LIO_WRITE nor LIO_READ,
+ * we lock and unlock the snd sockbuf for no reason.
+ */
+ so = fp->f_data;
+ sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
+ LIO_WRITE) && (!sowriteable(so)))) {
+ sb->sb_flags |= SB_AIO;
+
+ mtx_lock(&aio_job_mtx);
+ TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
+ mtx_unlock(&aio_job_mtx);
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBQSOCK;
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
+ SOCKBUF_UNLOCK(sb);
+ atomic_add_int(&num_queue_count, 1);
+ error = 0;
+ goto done;
+ }
+ SOCKBUF_UNLOCK(sb);
+ }
+
+ if ((error = aio_qphysio(p, aiocbe)) == 0)
+ goto done;
+#if 0
+ if (error > 0) {
+ aiocbe->uaiocb._aiocb_private.error = error;
+ ops->store_error(job, error);
+ goto done;
+ }
+#endif
+queueit:
+ /* No buffer for daemon I/O. */
+ aiocbe->bp = NULL;
+ atomic_add_int(&num_queue_count, 1);
+
+ AIO_LOCK(ki);
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ if (opcode == LIO_SYNC) {
+ TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ if (aiocbe->pending != 0) {
+ TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQSYNC;
+ AIO_UNLOCK(ki);
+ goto done;
+ }
+ }
+ mtx_lock(&aio_job_mtx);
+ TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQGLOBAL;
+ aio_kick_nowait(p);
+ mtx_unlock(&aio_job_mtx);
+ AIO_UNLOCK(ki);
+ error = 0;
+done:
+ return (error);
+}
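+
+/*
+ * Illustrative sketch only, not part of this file: the SIGEV_KEVENT
+ * registration performed above is what makes the following userland
+ * pattern work.  The helper name and error handling are assumptions
+ * for the example, not kernel interfaces.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/event.h>
+ *	#include <aio.h>
+ *	#include <err.h>
+ *	#include <string.h>
+ *
+ *	// kq is a kqueue(2) descriptor, fd an open file descriptor.
+ *	static ssize_t
+ *	read_via_kqueue(int kq, int fd, void *buf, size_t len)
+ *	{
+ *		struct aiocb cb;
+ *		struct kevent ev;
+ *
+ *		memset(&cb, 0, sizeof(cb));
+ *		cb.aio_fildes = fd;
+ *		cb.aio_buf = buf;
+ *		cb.aio_nbytes = len;
+ *		cb.aio_offset = 0;
+ *		cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+ *		cb.aio_sigevent.sigev_notify_kqueue = kq;
+ *		cb.aio_sigevent.sigev_value.sival_ptr = &cb;
+ *		if (aio_read(&cb) != 0)
+ *			err(1, "aio_read");
+ *		// The EVFILT_AIO event's ident is the userland aiocb
+ *		// pointer and udata is sigev_value (see the kev setup above).
+ *		if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
+ *			err(1, "kevent");
+ *		return (aio_return((struct aiocb *)ev.ident));
+ *	}
+ */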
+
+static void
+aio_kick_nowait(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
+ }
+}
+
+static int
+aio_kick(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+ int error, ret = 0;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+retryproc:
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ num_aio_resv_start++;
+ mtx_unlock(&aio_job_mtx);
+ error = aio_newproc(&num_aio_resv_start);
+ mtx_lock(&aio_job_mtx);
+ if (error) {
+ num_aio_resv_start--;
+ goto retryproc;
+ }
+ } else {
+ ret = -1;
+ }
+ return (ret);
+}
+
+static void
+aio_kick_helper(void *context, int pending)
+{
+ struct proc *userp = context;
+
+ mtx_lock(&aio_job_mtx);
+ while (--pending >= 0) {
+ if (aio_kick(userp))
+ break;
+ }
+ mtx_unlock(&aio_job_mtx);
+}
+
+/*
+ * Support the aio_return system call; as a side effect, kernel resources are
+ * released.
+ */
+static int
+kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ int status, error;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EINVAL);
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
+ if (cb->uuaiocb == uaiocb)
+ break;
+ }
+ if (cb != NULL) {
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ AIO_UNLOCK(ki);
+ ops->store_error(uaiocb, error);
+ ops->store_status(uaiocb, status);
+ } else {
+ error = EINVAL;
+ AIO_UNLOCK(ki);
+ }
+ return (error);
+}
+
+int
+sys_aio_return(struct thread *td, struct aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
+}
+
+/*
+ * Allow a process to wake up when any of the given I/O requests completes.
+ */
+static int
+kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
+ struct timespec *ts)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct aiocblist *cb, *cbfirst;
+ int error, i, timo;
+
+ timo = 0;
+ if (ts) {
+ if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EAGAIN);
+
+ if (njoblist == 0)
+ return (0);
+
+ AIO_LOCK(ki);
+ for (;;) {
+ cbfirst = NULL;
+ error = 0;
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+ for (i = 0; i < njoblist; i++) {
+ if (cb->uuaiocb == ujoblist[i]) {
+ if (cbfirst == NULL)
+ cbfirst = cb;
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ goto RETURN;
+ }
+ }
+ }
+ /* All tasks were finished. */
+ if (cbfirst == NULL)
+ break;
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiospn", timo);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+RETURN:
+ AIO_UNLOCK(ki);
+ return (error);
+}
+
+int
+sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+{
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ int error;
+
+ if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
+ if (error == 0)
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ uma_zfree(aiol_zone, ujoblist);
+ return (error);
+}
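+
+/*
+ * Illustrative sketch only, not part of this file: the classic userland
+ * pattern served by the syscalls above (aio_read(2), aio_error(2),
+ * aio_suspend(2), aio_return(2)).  The helper name is made up for the
+ * example.
+ *
+ *	#include <aio.h>
+ *	#include <err.h>
+ *	#include <errno.h>
+ *	#include <string.h>
+ *
+ *	static ssize_t
+ *	read_async(int fd, void *buf, size_t len)
+ *	{
+ *		struct aiocb cb;
+ *		const struct aiocb *list[1];
+ *
+ *		memset(&cb, 0, sizeof(cb));
+ *		cb.aio_fildes = fd;
+ *		cb.aio_buf = buf;
+ *		cb.aio_nbytes = len;
+ *		cb.aio_offset = 0;
+ *		if (aio_read(&cb) != 0)		// enqueued via aio_aqueue()
+ *			err(1, "aio_read");
+ *		list[0] = &cb;
+ *		while (aio_error(&cb) == EINPROGRESS)
+ *			(void)aio_suspend(list, 1, NULL);	// kern_aio_suspend()
+ *		return (aio_return(&cb));	// reaps the kernel job
+ *	}
+ */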
+
+/*
+ * aio_cancel cancels any non-physio aio operations not currently in
+ * progress.
+ */
+int
+sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+ struct aiocblist *cbe, *cbn;
+ struct file *fp;
+ struct socket *so;
+ int error;
+ int remove;
+ int cancelled = 0;
+ int notcancelled = 0;
+ struct vnode *vp;
+
+ /* Lookup file object. */
+ error = fget(td, uap->fd, NULL, &fp);
+ if (error)
+ return (error);
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ goto done;
+
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vn_isdisk(vp, &error)) {
+ fdrop(fp, td);
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+ if ((uap->fd == cbe->uaiocb.aio_fildes) &&
+ ((uap->aiocbp == NULL) ||
+ (uap->aiocbp == cbe->uuaiocb))) {
+ remove = 0;
+
+ mtx_lock(&aio_job_mtx);
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ MPASS(fp->f_type == DTYPE_SOCKET);
+ so = fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
+ cancelled++;
+ } else {
+ notcancelled++;
+ }
+ if (uap->aiocbp != NULL)
+ break;
+ }
+ }
+ AIO_UNLOCK(ki);
+
+done:
+ fdrop(fp, td);
+
+ if (uap->aiocbp != NULL) {
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+ }
+
+ if (notcancelled) {
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+
+ td->td_retval[0] = AIO_ALLDONE;
+
+ return (0);
+}
+
+/*
+ * aio_error is implemented at the kernel level for compatibility purposes
+ * only.  For a user-mode async implementation, it would be best done in
+ * a userland subroutine.
+ */
+static int
+kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ int status;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL) {
+ td->td_retval[0] = EINVAL;
+ return (0);
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+ if (cb->uuaiocb == aiocbp) {
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ td->td_retval[0] =
+ cb->uaiocb._aiocb_private.error;
+ else
+ td->td_retval[0] = EINPROGRESS;
+ AIO_UNLOCK(ki);
+ return (0);
+ }
+ }
+ AIO_UNLOCK(ki);
+
+ /*
+ * Hack for failure of aio_aqueue.
+ */
+ status = ops->fetch_status(aiocbp);
+ if (status == -1) {
+ td->td_retval[0] = ops->fetch_error(aiocbp);
+ return (0);
+ }
+
+ td->td_retval[0] = EINVAL;
+ return (0);
+}
+
+int
+sys_aio_error(struct thread *td, struct aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
+}
+
+/* syscall - asynchronous read from a file (REALTIME) */
+int
+sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb_ops_osigevent));
+}
+
+int
+sys_aio_read(struct thread *td, struct aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
+}
+
+/* syscall - asynchronous write to a file (REALTIME) */
+int
+sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb_ops_osigevent));
+}
+
+int
+sys_aio_write(struct thread *td, struct aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
+}
+
+int
+sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
+}
+
+static int
+kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
+ struct aiocb **acb_list, int nent, struct sigevent *sig,
+ struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocb *iocb;
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct kevent kev;
+ int error;
+ int nerror;
+ int i;
+
+ if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
+ return (EINVAL);
+
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ lj = uma_zalloc(aiolio_zone, M_WAITOK);
+ lj->lioj_flags = 0;
+ lj->lioj_count = 0;
+ lj->lioj_finished_count = 0;
+ knlist_init_mtx(&lj->klist, AIO_MTX(ki));
+ ksiginfo_init(&lj->lioj_ksi);
+
+ /*
+	 * Set up the signal.
+ */
+ if (sig && (mode == LIO_NOWAIT)) {
+ bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ /* Assume only new style KEVENT */
+ kev.filter = EVFILT_LIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.ident = (uintptr_t)uacb_list; /* something unique */
+ kev.data = (intptr_t)lj;
+ /* pass user defined sigval data */
+ kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
+ error = kqfd_register(
+ lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
+ if (error) {
+ uma_zfree(aiolio_zone, lj);
+ return (error);
+ }
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
+ ;
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
+ if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ } else {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
+ /*
+	 * Add an extra aiocb count so the lio cannot be freed by other
+	 * threads doing aio_waitcomplete or aio_return, and to prevent
+	 * the event from being sent until we have queued all tasks.
+ */
+ lj->lioj_count = 1;
+ AIO_UNLOCK(ki);
+
+ /*
+ * Get pointers to the list of I/O requests.
+ */
+ nerror = 0;
+ for (i = 0; i < nent; i++) {
+ iocb = acb_list[i];
+ if (iocb != NULL) {
+ error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
+ if (error != 0)
+ nerror++;
+ }
+ }
+
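+	/*
+	 * For LIO_WAIT, sleep until every job queued above has finished
+	 * (lioj_count was primed with an extra reference, hence the -1).
+	 * For LIO_NOWAIT, post the kevent and/or signal right away if
+	 * everything already completed while we were queueing.
+	 */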
+ error = 0;
+ AIO_LOCK(ki);
+ if (mode == LIO_WAIT) {
+ while (lj->lioj_count - 1 != lj->lioj_finished_count) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki),
+ PRIBIO | PCATCH, "aiospn", 0);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+ } else {
+ if (lj->lioj_count - 1 == lj->lioj_finished_count) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(p, &lj->lioj_signal,
+ &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+ }
+ lj->lioj_count--;
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ AIO_UNLOCK(ki);
+ uma_zfree(aiolio_zone, lj);
+ } else
+ AIO_UNLOCK(ki);
+
+ if (nerror)
+ return (EIO);
+ return (error);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent osig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig, sizeof(sig));
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
+ nent, sigp, &aiocb_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+/*
+ * Called from the interrupt thread for physio; we should return as fast
+ * as possible, so we schedule a biohelper task.
+ */
+static void
+aio_physwakeup(struct buf *bp)
+{
+ struct aiocblist *aiocbe;
+
+ aiocbe = (struct aiocblist *)bp->b_caller1;
+ taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
+}
+
+/*
+ * Task routine to perform the heavier work: process wakeups and signal delivery.
+ */
+static void
+biohelper(void *context, int pending)
+{
+ struct aiocblist *aiocbe = context;
+ struct buf *bp;
+ struct proc *userp;
+ struct kaioinfo *ki;
+ int nblks;
+
+ bp = aiocbe->bp;
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+ AIO_LOCK(ki);
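+	/*
+	 * Subtract the residual from the recorded status, latch any
+	 * error reported by the bio layer, and accumulate the block
+	 * count so it can be charged to rusage when the job is reaped.
+	 */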
+ aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.error = 0;
+ if (bp->b_ioflags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ nblks = btodb(aiocbe->uaiocb.aio_nbytes);
+ if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
+ aiocbe->outputcharge += nblks;
+ else
+ aiocbe->inputcharge += nblks;
+ aiocbe->bp = NULL;
+ TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
+ ki->kaio_buffer_count--;
+ aio_bio_done_notify(userp, aiocbe, DONE_BUF);
+ AIO_UNLOCK(ki);
+
+ /* Release mapping into kernel space. */
+ vunmapbuf(bp);
+ relpbuf(bp, NULL);
+ atomic_subtract_int(&num_buf_aio, 1);
+}
+
+/* syscall - wait for the next completion of an aio request */
+static int
+kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
+ struct timespec *ts, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct aiocblist *cb;
+ struct aiocb *uuaiocb;
+ int error, status, timo;
+
+ ops->store_aiocb(aiocbp, NULL);
+
+ timo = 0;
+ if (ts) {
+ if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+ ki = p->p_aioinfo;
+
+ error = 0;
+ cb = NULL;
+ AIO_LOCK(ki);
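+	/*
+	 * Block until some job reaches the done queue; a zero timeout
+	 * (no timespec supplied) sleeps indefinitely.  The first job
+	 * found is then reaped just as aio_return() would do.
+	 */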
+ while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiowc", timo);
+ if (timo && error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+
+ if (cb != NULL) {
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ uuaiocb = cb->uuaiocb;
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ AIO_UNLOCK(ki);
+ ops->store_aiocb(aiocbp, uuaiocb);
+ ops->store_error(uuaiocb, error);
+ ops->store_status(uuaiocb, status);
+ } else
+ AIO_UNLOCK(ki);
+
+ return (error);
+}
+
+int
+sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+{
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
+}
+
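+/*
+ * Queue an LIO_SYNC job for the descriptor named in the aiocb.  Only
+ * O_SYNC is accepted for now (see the XXX note about O_DSYNC below).
+ */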
+static int
+kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
+ struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+
+ if (op != O_SYNC) /* XXX lack of O_DSYNC */
+ return (EINVAL);
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ aio_init_aioinfo(p);
+ return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
+}
+
+int
+sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
+}
+
+/* kqueue attach function */
+static int
+filt_aioattach(struct knote *kn)
+{
+ struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+
+ /*
+ * The aiocbe pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_aio = aiocbe;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&aiocbe->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_aiodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_aio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_aio(struct knote *kn, long hint)
+{
+ struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
+
+ kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
+ if (aiocbe->jobstate != JOBST_JOBFINISHED)
+ return (0);
+ kn->kn_flags |= EV_EOF;
+ return (1);
+}
+
+/* kqueue attach function */
+static int
+filt_lioattach(struct knote *kn)
+{
+ struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
+
+ /*
+ * The aioliojob pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_lio = lj;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&lj->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_liodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_lio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_lio(struct knote *kn, long hint)
+{
+ struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+ return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
+}
+
+#ifdef COMPAT_FREEBSD32
+
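+/*
+ * 32-bit layouts of the aiocb structures.  Pointers are carried as
+ * uint32_t and converted with PTRIN()/PTRIN_CP() when requests from
+ * 32-bit processes are copied in.
+ */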
+struct __aiocb_private32 {
+ int32_t status;
+ int32_t error;
+ uint32_t kernelinfo;
+};
+
+typedef struct oaiocb32 {
+ int aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent32 aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+} oaiocb32_t;
+
+typedef struct aiocb32 {
+ int32_t aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ int __spare__[2];
+ uint32_t __spare2__;
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+ struct sigevent32 aio_sigevent; /* Signal to deliver */
+} aiocb32_t;
+
+static int
+convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ CP(*osig, *nsig, sigev_notify);
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb32 job32;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_old_sigevent32(&job32.aio_sigevent,
+ &kjob->aio_sigevent));
+}
+
+static int
+aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct aiocb32 job32;
+ int error;
+
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
+}
+
+static long
+aiocb32_fetch_status(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.status));
+}
+
+static long
+aiocb32_fetch_error(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.error));
+}
+
+static int
+aiocb32_store_status(struct aiocb *ujob, long status)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.status, status));
+}
+
+static int
+aiocb32_store_error(struct aiocb *ujob, long error)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.error, error));
+}
+
+static int
+aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword32(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb32_ops = {
+ .copyin = aiocb32_copyin,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+
+static struct aiocb_ops aiocb32_ops_osigevent = {
+ .copyin = aiocb32_copyin_old_sigevent,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+
+int
+freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+int
+freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ uint32_t *ujoblist32;
+ int error, i;
+
+ if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ ujoblist32 = (uint32_t *)ujoblist;
+ error = copyin(uap->aiocbp, ujoblist32, uap->nent *
+ sizeof(ujoblist32[0]));
+ if (error == 0) {
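+		/*
+		 * Expand the 32-bit pointers in place into the 64-bit
+		 * array; walking from the highest index downward keeps
+		 * each source element intact until it has been read.
+		 */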
+		for (i = uap->nent - 1; i >= 0; i--)
+			ujoblist[i] = PTRIN(ujoblist32[i]);
+
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ }
+ uma_zfree(aiol_zone, ujoblist);
+ return (error);
+}
+
+int
+freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
+{
+
+ return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
+}
+
+int
+freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+int
+freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops_osigevent));
+}
+
+int
+freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops_osigevent));
+}
+
+int
+freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_waitcomplete(struct thread *td,
+ struct freebsd32_aio_waitcomplete_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts32, sizeof(ts32));
+ if (error)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent32 osig;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent32(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+int
+freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct sigevent32 sig32;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig32, sizeof(sig32));
+ if (error)
+ return (error);
+ error = convert_sigevent32(&sig32, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+#endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
new file mode 100644
index 0000000..ea8a002
--- /dev/null
+++ b/sys/kern/vfs_bio.c
@@ -0,0 +1,4602 @@
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains a new buffer I/O scheme implementing a coherent
+ * VM object and buffer cache scheme.  Pains have been taken to make
+ * sure that the performance degradation associated with schemes such
+ * as this is not realized.
+ *
+ * Author: John S. Dyson
+ * Significant help during the development and debugging phases
+ * was provided by David Greenman, also of the FreeBSD core team.
+ *
+ * See buf(9) for more information.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+#include <sys/devicestat.h>
+#include <sys/eventhandler.h>
+#include <sys/fail.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sysctl.h>
+#include <sys/vmem.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <geom/geom.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include "opt_compat.h"
+#include "opt_directio.h"
+#include "opt_swap.h"
+
+static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
+
+struct bio_ops bioops; /* I/O operation notification */
+
+struct buf_ops buf_ops_bio = {
+ .bop_name = "buf_ops_bio",
+ .bop_write = bufwrite,
+ .bop_strategy = bufstrategy,
+ .bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
+};
+
+/*
+ * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
+ * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
+ */
+struct buf *buf; /* buffer header pool */
+caddr_t unmapped_buf;
+
+static struct proc *bufdaemonproc;
+
+static int inmem(struct vnode *vp, daddr_t blkno);
+static void vm_hold_free_pages(struct buf *bp, int newbsize);
+static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
+static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
+ vm_page_t m);
+static void vfs_clean_pages_dirty_buf(struct buf *bp);
+static void vfs_setdirty_locked_object(struct buf *bp);
+static void vfs_vmio_release(struct buf *bp);
+static int vfs_bio_clcheck(struct vnode *vp, int size,
+ daddr_t lblkno, daddr_t blkno);
+static int buf_flush(struct vnode *vp, int);
+static int flushbufqueues(struct vnode *, int, int);
+static void buf_daemon(void);
+static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
+#endif
+
+int vmiodirenable = TRUE;
+SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
+ "Use the VM system for directory writes");
+long runningbufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
+ "Amount of presently outstanding async buffer io");
+static long bufspace;
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
+ &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
+#else
+SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
+ "Virtual memory used for buffers");
+#endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+ &unmapped_bufspace, 0,
+    "Amount of unmapped buffer space, included in bufspace");
+static long maxbufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
+ "Maximum allowed value of bufspace (including buf_daemon)");
+static long bufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
+ "Amount of malloced memory for buffers");
+static long maxbufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
+ "Maximum amount of malloced memory for buffers");
+static long lobufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+ "Minimum amount of buffers we want to have");
+long hibufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
+ "Maximum allowed value of bufspace (excluding buf_daemon)");
+static int bufreusecnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
+ "Number of times we have reused a buffer");
+static int buffreekvacnt;
+SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+ "Number of times we have freed the KVA space from some buffer");
+static int bufdefragcnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+ "Number of times we have had to repeat buffer allocation to defragment");
+static long lorunningspace;
+SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
+ "Minimum preferred space used for in-progress I/O");
+static long hirunningspace;
+SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
+ "Maximum amount of space to use for in-progress I/O");
+int dirtybufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
+ 0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+ 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
+ 0, "Number of fsync flushes to limit dirty buffers");
+static int recursiveflushes;
+SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
+ 0, "Number of flushes skipped due to being recursive");
+static int numdirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+    "Number of buffers that are dirty (have unwritten changes) at the moment");
+static int lodirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+ "How many buffers we want to have free before bufdaemon can sleep");
+static int hidirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+ "When the number of dirty buffers is considered severe");
+int dirtybufthresh;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
+ 0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
+static int numfreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
+ "Number of free buffers");
+static int lofreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+ "XXX Unused");
+static int hifreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+ "XXX Complicatedly unused");
+static int getnewbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
+ "Number of calls to getnewbuf");
+static int getnewbufrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+    "Number of times getnewbuf has had to restart a buffer acquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
+static int flushbufqtarget = 100;
+SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
+ "Amount of work to do in flushbufqueues when helping bufdaemon");
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+ "Number of dirty buffer flushes done by the bufdaemon helpers");
+static long barrierwrites;
+SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
+ "Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+    "Permit the use of unmapped I/O");
+
+/*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
+ * Wakeup point for bufdaemon, as well as indicator of whether it is already
+ * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
+ * is idling.
+ */
+static int bd_request;
+
+/*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuffers.  This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
+/*
+ * Bogus page -- for I/O to/from partially complete buffers.
+ * This is a temporary solution to the problem, but it is not
+ * really that bad.  It would be better to split the buffer
+ * for input in the case of buffers partially already in memory,
+ * but the code is intricate enough already.
+ */
+vm_page_t bogus_page;
+
+/*
+ * Synchronization (sleep/wakeup) variable for active buffer space requests.
+ * Set when wait starts, cleared prior to wakeup().
+ * Used in runningbufwakeup() and waitrunningbufspace().
+ */
+static int runningbufreq;
+
+/*
+ * Synchronization (sleep/wakeup) variable for buffer requests.
+ * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
+ * by and/or.
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * getnewbuf(), and getblk().
+ */
+static int needsbuffer;
+
+/*
+ * Synchronization for bwillwrite() waiters.
+ */
+static int bdirtywait;
+
+/*
+ * Definitions for the buffer free lists.
+ */
+#define BUFFER_QUEUES 5 /* number of free buffer queues */
+
+#define QUEUE_NONE 0 /* on no queue */
+#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
+#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
+#define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY 4 /* empty buffer headers */
+#define QUEUE_SENTINEL	1024	/* not a queue index, but a sentinel marker */
+
+/* Queues for free buffers with various properties */
+static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
+
+/*
+ * Single global constant for BUF_WMESG, to avoid getting multiple references.
+ * buf_wmesg is referenced from macros.
+ */
+const char *buf_wmesg = BUF_WMESG;
+
+#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
+#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
+#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
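+/*
+ * Compatibility shim: older binaries expect vfs.bufspace to be an int.
+ * Hand back an int when the caller's buffer is too small for a long,
+ * unless the value no longer fits in an int, in which case a long is
+ * written anyway to trigger the usual ENOMEM from the sysctl code.
+ */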
+static int
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
+{
+ long lvalue;
+ int ivalue;
+
+ if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
+ return (sysctl_handle_long(oidp, arg1, arg2, req));
+ lvalue = *(long *)arg1;
+ if (lvalue > INT_MAX)
+ /* On overflow, still write out a long to trigger ENOMEM. */
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ ivalue = lvalue;
+ return (sysctl_handle_int(oidp, &ivalue, 0, req));
+}
+#endif
+
+#ifdef DIRECTIO
+extern void ffs_rawread_setup(void);
+#endif /* DIRECTIO */
+
+/*
+ * bqlock:
+ *
+ * Return the appropriate queue lock based on the index.
+ */
+static inline struct mtx *
+bqlock(int qindex)
+{
+
+ if (qindex == QUEUE_DIRTY)
+ return (struct mtx *)(&bqdirty);
+ return (struct mtx *)(&bqclean);
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
+ }
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+ if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bdirtywakeup();
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bd_wakeup();
+}
+
+/*
+ * bufspacewakeup:
+ *
+ * Called when buffer space is potentially available for recovery.
+ * getnewbuf() will block on this flag when it is unable to free
+ * sufficient buffer space. Buffer space becomes recoverable when
+ * bp's get placed back in the queues.
+ */
+
+static __inline void
+bufspacewakeup(void)
+{
+
+ /*
+ * If someone is waiting for BUF space, wake them up. Even
+ * though we haven't freed the kva space yet, the waiting
+ * process will be able to now.
+ */
+ mtx_lock(&nblock);
+ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
+ needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
+ wakeup(&needsbuffer);
+ }
+ mtx_unlock(&nblock);
+}
+
+/*
+ * runningwakeup:
+ *
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count accordingly.
+ */
+void
+runningbufwakeup(struct buf *bp)
+{
+ long space, bspace;
+
+ bspace = bp->b_runningbufspace;
+ if (bspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bspace);
+ KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
+ space, bspace));
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
+}
+
+/*
+ * bufcountadd:
+ *
+ * Called when a buffer has been added to one of the free queues to
+ * account for the buffer and to wakeup anyone waiting for free buffers.
+ * This typically occurs when large amounts of metadata are being handled
+ * by the buffer cache (else buffer space runs out first, usually).
+ */
+static __inline void
+bufcountadd(struct buf *bp)
+{
+ int old;
+
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
+ ("buf %p already counted as free", bp));
+ bp->b_flags |= B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, 1);
+ KASSERT(old >= 0 && old < nbuf,
+ ("numfreebuffers climbed to %d", old + 1));
+ mtx_lock(&nblock);
+ if (needsbuffer) {
+ needsbuffer &= ~VFS_BIO_NEED_ANY;
+ if (numfreebuffers >= hifreebuffers)
+ needsbuffer &= ~VFS_BIO_NEED_FREE;
+ wakeup(&needsbuffer);
+ }
+ mtx_unlock(&nblock);
+}
+
+/*
+ * bufcountsub:
+ *
+ * Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+ int old;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, the buffer was free and we must decrement
+ * numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ KASSERT((bp->b_flags & B_INFREECNT) != 0,
+ ("buf %p not counted in numfreebuffers", bp));
+ bp->b_flags &= ~B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, -1);
+ KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+ }
+}
+
+/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+void
+waitrunningbufspace(void)
+{
+
+ mtx_lock(&rbreqlock);
+ while (runningbufspace > hirunningspace) {
+ runningbufreq = 1;
+ msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+
+/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+/* Wake up the buffer daemon if necessary */
+static __inline void
+bd_wakeup(void)
+{
+
+ mtx_lock(&bdlock);
+ if (bd_request == 0) {
+ bd_request = 1;
+ wakeup(&bd_request);
+ }
+ mtx_unlock(&bdlock);
+}
+
+/*
+ * bd_speedup - speedup the buffer cache flushing code
+ */
+void
+bd_speedup(void)
+{
+ int needwake;
+
+ mtx_lock(&bdlock);
+ needwake = 0;
+ if (bd_speedupreq == 0 || bd_request == 0)
+ needwake = 1;
+ bd_speedupreq = 1;
+ bd_request = 1;
+ if (needwake)
+ wakeup(&bd_request);
+ mtx_unlock(&bdlock);
+}
+
+#ifdef __i386__
+#define TRANSIENT_DENOM 5
+#else
+#define TRANSIENT_DENOM 10
+#endif
+
+/*
+ * Calculate buffer cache scaling values and reserve space for buffer
+ * headers.  This is called during low-level kernel initialization and
+ * may be called more than once.  We CANNOT write to the memory area
+ * being reserved at this time.
+ */
+caddr_t
+kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
+{
+ int tuned_nbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
+
+ /*
+ * physmem_est is in pages. Convert it to kilobytes (assumes
+ * PAGE_SIZE is >= 1K)
+ */
+ physmem_est = physmem_est * (PAGE_SIZE / 1024);
+
+ /*
+ * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
+ * For the first 64MB of ram nominally allocate sufficient buffers to
+ * cover 1/4 of our ram. Beyond the first 64MB allocate additional
+ * buffers to cover 1/10 of our ram over 64MB. When auto-sizing
+ * the buffer cache we limit the eventual kva reservation to
+ * maxbcache bytes.
+ *
+ * factor represents the 1/4 x ram conversion.
+ */
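+	/*
+	 * Rough example (assuming the common BKVASIZE of 16 KiB, so
+	 * factor == 64): with physmem_est of 128 MiB (131072 KiB) the
+	 * first term contributes min((131072 - 4096) / 64, 65536 / 64)
+	 * = 1024 buffers and the second min(65536 * 2 / 320, ...) = 409,
+	 * giving nbuf of roughly 50 + 1024 + 409 = 1483.
+	 */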
+ if (nbuf == 0) {
+ int factor = 4 * BKVASIZE / 1024;
+
+ nbuf = 50;
+ if (physmem_est > 4096)
+ nbuf += min((physmem_est - 4096) / factor,
+ 65536 / factor);
+ if (physmem_est > 65536)
+ nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
+ 32 * 1024 * 1024 / (factor * 5));
+
+ if (maxbcache && nbuf > maxbcache / BKVASIZE)
+ nbuf = maxbcache / BKVASIZE;
+ tuned_nbuf = 1;
+ } else
+ tuned_nbuf = 0;
+
+ /* XXX Avoid unsigned long overflows later on with maxbufspace. */
+ maxbuf = (LONG_MAX / 3) / BKVASIZE;
+ if (nbuf > maxbuf) {
+ if (!tuned_nbuf)
+ printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
+ maxbuf);
+ nbuf = maxbuf;
+ }
+
+ /*
+	 * The ideal allocation size for the transient bio submap is 10%
+	 * of the maximal space buffer map.  This roughly corresponds
+	 * to the amount of the buffer mapped for typical UFS load.
+	 *
+	 * Clip the buffer map to reserve space for the transient
+	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
+	 * maximum buffer map extent on the platform.
+	 *
+	 * Falling back to maxbuf when maxbcache is unset avoids
+	 * trimming the buffer KVA on architectures with ample KVA
+	 * space.
+ */
+ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
+ (TRANSIENT_DENOM - 1)) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% (20% on i386) of
+ * the buffer map to the transient bio map.
+ */
+ biotmap_sz = buf_sz / TRANSIENT_DENOM;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+		 * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
+ * swbufs are used as temporary holders for I/O, such as paging I/O.
+	 * We have no fewer than 16 and no more than 256.
+ */
+ nswbuf = max(min(nbuf/4, 256), 16);
+#ifdef NSWBUF_MIN
+ if (nswbuf < NSWBUF_MIN)
+ nswbuf = NSWBUF_MIN;
+#endif
+#ifdef DIRECTIO
+ ffs_rawread_setup();
+#endif
+
+ /*
+ * Reserve space for the buffer cache buffers
+ */
+ swbuf = (void *)v;
+ v = (caddr_t)(swbuf + nswbuf);
+ buf = (void *)v;
+ v = (caddr_t)(buf + nbuf);
+
+ return(v);
+}
+
+/* Initialize the buffer subsystem. Called before use of any buffers. */
+void
+bufinit(void)
+{
+ struct buf *bp;
+ int i;
+
+ mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+ mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
+ mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
+ mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
+
+ /* next, make a null set of free lists */
+ for (i = 0; i < BUFFER_QUEUES; i++)
+ TAILQ_INIT(&bufqueues[i]);
+
+ /* finally, initialize each buffer header and stick on empty q */
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL | B_INFREECNT;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_EMPTY;
+ bp->b_xflags = 0;
+ LIST_INIT(&bp->b_dep);
+ BUF_LOCKINIT(bp);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_EMPTY]++;
+#endif
+ }
+
+ /*
+ * maxbufspace is the absolute maximum amount of buffer space we are
+ * allowed to reserve in KVM and in real terms. The absolute maximum
+ * is nominally used by buf_daemon. hibufspace is the nominal maximum
+ * used by most other processes. The differential is required to
+ * ensure that buf_daemon is able to run when other processes might
+ * be blocked waiting for buffer space.
+ *
+	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
+ * this may result in KVM fragmentation which is not handled optimally
+ * by the system.
+ */
+ maxbufspace = (long)nbuf * BKVASIZE;
+ hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
+ lobufspace = hibufspace - MAXBSIZE;
+
+ /*
+ * Note: The 16 MiB upper limit for hirunningspace was chosen
+ * arbitrarily and may need further tuning. It corresponds to
+ * 128 outstanding write IO requests (if IO size is 128 KiB),
+ * which fits with many RAID controllers' tagged queuing limits.
+ * The lower 1 MiB limit is the historical upper limit for
+ * hirunningspace.
+ */
+ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
+ 16 * 1024 * 1024), 1024 * 1024);
+ lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
+
+/*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly for average
+ * (small) directories.
+ */
+ maxbufmallocspace = hibufspace / 20;
+
+/*
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
+ hidirtybuffers = nbuf / 4 + 20;
+ dirtybufthresh = hidirtybuffers * 9 / 10;
+ numdirtybuffers = 0;
+/*
+ * To support extreme low-memory systems, make sure hidirtybuffers cannot
+ * eat up all available buffer space. This occurs when our minimum cannot
+ * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
+ * BKVASIZE'd buffers.
+ */
+ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
+ hidirtybuffers >>= 1;
+ }
+ lodirtybuffers = hidirtybuffers / 2;
+
+/*
+ * Try to keep the number of free buffers in the specified range,
+ * and give special processes (e.g., buf_daemon) access to an
+ * emergency reserve.
+ */
+ lofreebuffers = nbuf / 18 + 5;
+ hifreebuffers = 2 * lofreebuffers;
+ numfreebuffers = nbuf;
+
+ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+ unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
+}
+
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+ ("mapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+ ("unmapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase == unmapped_buf,
+ ("unmapped buf: corrupted b_kvabase %p", bp));
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
+}
+
+/*
+ * bfreekva() - free the kva allocation for a buffer.
+ *
+ * Since this call frees up buffer space, we call bufspacewakeup().
+ */
+static void
+bfreekva(struct buf *bp)
+{
+
+ if (bp->b_kvasize == 0)
+ return;
+
+ atomic_add_int(&buffreekvacnt, 1);
+ atomic_subtract_long(&bufspace, bp->b_kvasize);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
+ bp->b_kvasize);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ if ((bp->b_flags & B_KVAALLOC) != 0) {
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
+ bp->b_kvasize);
+ }
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ }
+ bp->b_kvasize = 0;
+ bufspacewakeup();
+}
+
+/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct mtx *olock, *nlock;
+
+ BUF_ASSERT_XLOCKED(bp);
+
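+	/*
+	 * Take the lock of the buffer's old queue first so that any
+	 * delayed bremfree() can be completed before the buffer is
+	 * moved to (and, if needed, relocked under) its new queue.
+	 */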
+ olock = bqlock(bp->b_qindex);
+ nlock = bqlock(qindex);
+ mtx_lock(olock);
+ /* Handle delayed bremfree() processing. */
+ if (bp->b_flags & B_REMFREE)
+ bremfreel(bp);
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("binsfree: free buffer onto another queue???");
+
+ bp->b_qindex = qindex;
+ if (olock != nlock) {
+ mtx_unlock(olock);
+ mtx_lock(nlock);
+ }
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
+ mtx_unlock(nlock);
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ bufcountadd(bp);
+}
+
+/*
+ * bremfree:
+ *
+ * Mark the buffer for removal from the appropriate free list.
+ *
+ */
+void
+bremfree(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT((bp->b_flags & B_REMFREE) == 0,
+ ("bremfree: buffer %p already marked for delayed removal.", bp));
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bremfree: buffer %p not on a queue.", bp));
+ BUF_ASSERT_XLOCKED(bp);
+
+ bp->b_flags |= B_REMFREE;
+ bufcountsub(bp);
+}
+
+/*
+ * bremfreef:
+ *
+ * Force an immediate removal from a free list. Used only in nfs when
+ * it abuses the b_freelist pointer.
+ */
+void
+bremfreef(struct buf *bp)
+{
+ struct mtx *qlock;
+
+ qlock = bqlock(bp->b_qindex);
+ mtx_lock(qlock);
+ bremfreel(bp);
+ mtx_unlock(qlock);
+}
+
+/*
+ * bremfreel:
+ *
+ * Removes a buffer from the free list, must be called with the
+ * correct qlock held.
+ */
+static void
+bremfreel(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bremfreel: buffer %p not on a queue.", bp));
+ BUF_ASSERT_XLOCKED(bp);
+ mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
+
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+ bp->b_qindex));
+ bq_len[bp->b_qindex]--;
+#endif
+ bp->b_qindex = QUEUE_NONE;
+ /*
+	 * If this was a delayed bremfree() we only need to remove the buffer
+	 * from the queue and return; the stats are already done.
+ */
+ if (bp->b_flags & B_REMFREE) {
+ bp->b_flags &= ~B_REMFREE;
+ return;
+ }
+ bufcountsub(bp);
+}
+
+/*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+ int cnt, struct ucred * cred)
+{
+ struct buf *rabp;
+ int i;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+}
+
+/*
+ * Entry point for bread() and breadn() via #defines in sys/buf.h.
+ *
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything, see
+ * getblk(). Also starts asynchronous I/O on read-ahead blocks.
+ */
+int
+breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
+ int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
+{
+ struct buf *bp;
+ int rv = 0, readwait = 0;
+
+ CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
+ /*
+ * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
+ */
+ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
+ if (bp == NULL)
+ return (EBUSY);
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
+ bp->b_iocmd = BIO_READ;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if (bp->b_rcred == NOCRED && cred != NOCRED)
+ bp->b_rcred = crhold(cred);
+ vfs_busy_pages(bp, 0);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+ ++readwait;
+ }
+
+ breada(vp, rablkno, rabsize, cnt, cred);
+
+ if (readwait) {
+ rv = bufwait(bp);
+ }
+ return (rv);
+}
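+
+/*
+ * A minimal caller-side sketch of the bread() wrapper from sys/buf.h
+ * (illustrative only; "lbn" and "bsize" stand for values the caller
+ * already has):
+ *
+ *	error = bread(vp, lbn, bsize, NOCRED, &bp);
+ *	if (error != 0) {
+ *		brelse(bp);
+ *		return (error);
+ *	}
+ *	(consume bp->b_data, then give the buffer back)
+ *	bqrelse(bp);
+ */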
+
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that the buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
+ */
+int
+bufwrite(struct buf *bp)
+{
+ int oldflags;
+ struct vnode *vp;
+ long space;
+ int vp_md;
+
+ CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+
+ if (bp->b_flags & B_BARRIER)
+ barrierwrites++;
+
+ oldflags = bp->b_flags;
+
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_pin_count > 0)
+ bunpin_wait(bp);
+
+ KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
+ ("FFS background buffer should not get here %p", bp));
+
+ vp = bp->b_vp;
+ if (vp)
+ vp_md = vp->v_vflag & VV_MD;
+ else
+ vp_md = 0;
+
+ /*
+ * Mark the buffer clean. Increment the bufobj write count
+ * before bundirty() call, to prevent other thread from seeing
+ * empty dirty list and zero counter for writes in progress,
+ * falsely indicating that the bufobj is clean.
+ */
+ bufobj_wref(bp->b_bufobj);
+ bundirty(bp);
+
+ bp->b_flags &= ~B_DONE;
+ bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_flags |= B_CACHE;
+ bp->b_iocmd = BIO_WRITE;
+
+ vfs_busy_pages(bp, 1);
+
+ /*
+ * Normal bwrites pipeline writes
+ */
+ bp->b_runningbufspace = bp->b_bufsize;
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
+
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_oublock++;
+ if (oldflags & B_ASYNC)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = bufwait(bp);
+ brelse(bp);
+ return (rtval);
+ } else if (space > hirunningspace) {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. We will not deadlock here because
+ * we are blocking waiting for I/O that is already in-progress
+ * to complete. We do not block here if it is the update
+ * or syncer daemon trying to clean up as that can lead
+ * to deadlock.
+ */
+ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
+ waitrunningbufspace();
+ }
+
+ return (0);
+}
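+
+/*
+ * A sketch of the synchronous write path as seen by a caller such as
+ * flushbufqueues() below (illustrative; "bp" is assumed locked and
+ * still on a queue):
+ *
+ *	bremfree(bp);
+ *	error = bwrite(bp);	(waits via bufwait() and releases bp)
+ */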
+
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ struct buf *nbp;
+
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+ (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+ altbufferflushes++;
+ } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /* Don't call buf_countdeps() with the bo lock held. */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
+/*
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
+ */
+void
+bdwrite(struct buf *bp)
+{
+ struct thread *td = curthread;
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT((bp->b_flags & B_BARRIER) == 0,
+ ("Barrier request in delayed write %p", bp));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+
+ /*
+ * If we have too many dirty buffers, don't create any more.
+ * If we are wildly over our limit, then force a complete
+ * cleanup. Otherwise, just keep the situation from getting
+ * out of control. Note that we have to avoid a recursive
+ * disaster and not try to clean up after our own cleanup!
+ */
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
+ td->td_pflags |= TDP_INBDFLUSH;
+ BO_BDFLUSH(bo, bp);
+ td->td_pflags &= ~TDP_INBDFLUSH;
+ } else
+ recursiveflushes++;
+
+ bdirty(bp);
+ /*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
+ * This bmap keeps the system from needing to do the bmap later,
+ * perhaps when the system is attempting to do a sync. Since the
+ * indirect block -- or whatever other data structure the filesystem
+ * needs -- is likely still in memory now, this is a good time to do
+ * the bmap. Note also that if the pageout daemon is requesting a
+ * sync, there might not be enough memory to do the bmap then, so
+ * doing it now is important.
+ */
+ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
+
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty
+ * pages.
+ *
+ * Mark the buffer pages as clean. We need to do this here to
+ * satisfy the vnode_pager and the pageout daemon, so that they
+ * think that the pages have been "cleaned". Note that since
+ * the pages are in a delayed write buffer, the VFS layer will
+ * see that the pages get written out on the next sync, or
+ * perhaps the cluster will be completed.
+ */
+ vfs_clean_pages_dirty_buf(bp);
+ bqrelse(bp);
+
+ /*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+}
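+
+/*
+ * A sketch of the usual delayed-write sequence (illustrative only;
+ * "lbn" and "bsize" are assumed caller state):
+ *
+ *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
+ *	(modify bp->b_data)
+ *	bdwrite(bp);		(marks B_DELWRI, requeues and unlocks bp)
+ */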
+
+/*
+ * bdirty:
+ *
+ * Turn buffer into delayed write request. We must clear BIO_READ and
+ * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
+ * itself to properly update it in the dirty/clean lists. We mark it
+ * B_DONE to ensure that any asynchronization of the buffer properly
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+void
+bdirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+ bp->b_flags &= ~(B_RELBUF);
+ bp->b_iocmd = BIO_WRITE;
+
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
+ reassignbuf(bp);
+ bdirtyadd();
+ }
+}
+
+/*
+ * bundirty:
+ *
+ * Clear B_DELWRI for buffer.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+
+void
+bundirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags &= ~B_DELWRI;
+ reassignbuf(bp);
+ bdirtysub();
+ }
+ /*
+ * Since it is now being written, we can clear its deferred write flag.
+ */
+ bp->b_flags &= ~B_DEFERRED;
+}
+
+/*
+ * bawrite:
+ *
+ * Asynchronous write. Start output on a buffer, but do not wait for
+ * it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
+ */
+void
+bawrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC;
+ (void) bwrite(bp);
+}
+
+/*
+ * babarrierwrite:
+ *
+ * Asynchronous barrier write. Start output on a buffer, but do not
+ * wait for it to complete. Place a write barrier after this write so
+ * that this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+void
+babarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC | B_BARRIER;
+ (void) bwrite(bp);
+}
+
+/*
+ * bbarrierwrite:
+ *
+ * Synchronous barrier write. Start output on a buffer and wait for
+ * it to complete. Place a write barrier after this write so that
+ * this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+int
+bbarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_BARRIER;
+ return (bwrite(bp));
+}
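+
+/*
+ * The three wrappers above differ only in the flags they add before
+ * handing the buffer to bwrite(); a compact comparison (illustrative):
+ *
+ *	bawrite(bp);			B_ASYNC
+ *	babarrierwrite(bp);		B_ASYNC | B_BARRIER
+ *	error = bbarrierwrite(bp);	B_BARRIER, waits for completion
+ */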
+
+/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+void
+bwillwrite(void)
+{
+
+ if (numdirtybuffers >= hidirtybuffers) {
+ mtx_lock(&bdirtylock);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
+ }
+ mtx_unlock(&bdirtylock);
+ }
+}
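+
+/*
+ * A sketch of where a write path is expected to call bwillwrite(),
+ * before any vnode locks are taken (illustrative only; the VOP_WRITE()
+ * arguments are assumed caller state):
+ *
+ *	bwillwrite();
+ *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ *	error = VOP_WRITE(vp, uio, ioflag, cred);
+ *	VOP_UNLOCK(vp, 0);
+ */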
+
+/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+
+ return (numdirtybuffers >= hidirtybuffers);
+}
+
+static __noinline int
+buf_vm_page_count_severe(void)
+{
+
+ KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
+
+ return vm_page_count_severe();
+}
+
+/*
+ * brelse:
+ *
+ * Release a busy buffer and, if requested, free its resources. The
+ * buffer will be stashed in the appropriate bufqueue[] allowing it
+ * to be accessed later as a cache entity or reused for other purposes.
+ */
+void
+brelse(struct buf *bp)
+{
+ int qindex;
+
+ CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ if (BUF_LOCKRECURSED(bp)) {
+ /*
+ * Do not process, in particular, do not handle the
+ * B_INVAL/B_RELBUF and do not release to free list.
+ */
+ BUF_UNLOCK(bp);
+ return;
+ }
+
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
+ if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
+ bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
+ /*
+ * Failed write, redirty. Must clear BIO_ERROR to prevent
+ * pages from being scrapped. If the error is anything
+ * other than an I/O error (EIO), assume that retrying
+ * is futile.
+ */
+ bp->b_ioflags &= ~BIO_ERROR;
+ bdirty(bp);
+ } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
+ (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
+ bp->b_flags |= B_INVAL;
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub();
+ bp->b_flags &= ~(B_DELWRI | B_CACHE);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+ }
+
+ /*
+ * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
+ * is called with B_DELWRI set, the underlying pages may wind up
+ * getting freed causing a previous write (bdwrite()) to get 'lost'
+ * because pages associated with a B_DELWRI bp are marked clean.
+ *
+ * We still allow the B_INVAL case to call vfs_vmio_release(), even
+ * if B_DELWRI is set.
+ *
+ * If B_DELWRI is not set we may have to set B_RELBUF if we are low
+ * on pages to return pages to the VM page queues.
+ */
+ if (bp->b_flags & B_DELWRI)
+ bp->b_flags &= ~B_RELBUF;
+ else if (buf_vm_page_count_severe()) {
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (!(bp->b_vflags & BV_BKGRDINPROG))
+ bp->b_flags |= B_RELBUF;
+ }
+
+ /*
+ * VMIO buffer rundown. It is generally not necessary to keep a VMIO
+ * buffer constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
+ *
+ * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. BIO_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
+ *
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer. If the
+ * buffer has a background write in progress, we need to keep it
+ * around to prevent it from being reconstituted and starting a second
+ * background write.
+ */
+ if ((bp->b_flags & B_VMIO)
+ && !(bp->b_vp->v_mount != NULL &&
+ (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
+ !vn_isdisk(bp->b_vp, NULL) &&
+ (bp->b_flags & B_DELWRI))
+ ) {
+
+ int i, j, resid;
+ vm_page_t m;
+ off_t foff;
+ vm_pindex_t poff;
+ vm_object_t obj;
+
+ obj = bp->b_bufobj->bo_object;
+
+ /*
+ * Get the base offset and length of the buffer. Note that
+ * in the VMIO case if the buffer block size is not
+ * page-aligned then b_data pointer may not be page-aligned.
+ * But our b_pages[] array *IS* page aligned.
+ *
+ * block sizes less than DEV_BSIZE (usually 512) are not
+ * supported due to the page granularity bits (m->valid,
+ * m->dirty, etc...).
+ *
+ * See man buf(9) for more information
+ */
+ resid = bp->b_bufsize;
+ foff = bp->b_offset;
+ for (i = 0; i < bp->b_npages; i++) {
+ int had_bogus = 0;
+
+ m = bp->b_pages[i];
+
+ /*
+ * If we hit a bogus page, fixup *all* the bogus pages
+ * now.
+ */
+ if (m == bogus_page) {
+ poff = OFF_TO_IDX(bp->b_offset);
+ had_bogus = 1;
+
+ VM_OBJECT_RLOCK(obj);
+ for (j = i; j < bp->b_npages; j++) {
+ vm_page_t mtmp;
+ mtmp = bp->b_pages[j];
+ if (mtmp == bogus_page) {
+ mtmp = vm_page_lookup(obj, poff + j);
+ if (!mtmp) {
+ panic("brelse: page missing\n");
+ }
+ bp->b_pages[j] = mtmp;
+ }
+ }
+ VM_OBJECT_RUNLOCK(obj);
+
+ if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(
+ trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+ m = bp->b_pages[i];
+ }
+ if ((bp->b_flags & B_NOCACHE) ||
+ (bp->b_ioflags & BIO_ERROR &&
+ bp->b_iocmd == BIO_READ)) {
+ int poffset = foff & PAGE_MASK;
+ int presid = resid > (PAGE_SIZE - poffset) ?
+ (PAGE_SIZE - poffset) : resid;
+
+ KASSERT(presid >= 0, ("brelse: extra page"));
+ VM_OBJECT_WLOCK(obj);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ vm_page_busy_sleep(m, "mbncsh");
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (pmap_page_wired_mappings(m) == 0)
+ vm_page_set_invalid(m, poffset, presid);
+ VM_OBJECT_WUNLOCK(obj);
+ if (had_bogus)
+ printf("avoided corruption bug in bogus_page/brelse code\n");
+ }
+ resid -= PAGE_SIZE - (foff & PAGE_MASK);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+
+ } else if (bp->b_flags & B_VMIO) {
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF)) {
+ vfs_vmio_release(bp);
+ }
+
+ } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
+ if (bp->b_bufsize != 0)
+ allocbuf(bp, 0);
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * If the buffer has junk contents, signal it and eventually
+ * clean up B_DELWRI and disassociate the vnode so that gbincore()
+ * doesn't find it.
+ */
+ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
+ (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
+ bp->b_flags |= B_INVAL;
+ if (bp->b_flags & B_INVAL) {
+ if (bp->b_flags & B_DELWRI)
+ bundirty(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_kvasize)
+ qindex = QUEUE_EMPTYKVA;
+ else
+ qindex = QUEUE_EMPTY;
+ bp->b_flags |= B_AGE;
+ /* buffers with junk contents */
+ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
+ (bp->b_ioflags & BIO_ERROR)) {
+ bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 2");
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
+ /* remaining buffers */
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
+
+ binsfree(bp, qindex);
+
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
+ if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
+ panic("brelse: not dirty");
+ /* unlock */
+ BUF_UNLOCK(bp);
+}
+
+/*
+ * Release a buffer back to the appropriate queue but do not try to free
+ * it. The buffer is expected to be used again soon.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
+ */
+void
+bqrelse(struct buf *bp)
+{
+ int qindex;
+
+ CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ if (BUF_LOCKRECURSED(bp)) {
+ /* do not release to free list */
+ BUF_UNLOCK(bp);
+ return;
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
+ }
+
+ /* buffers with stale but valid contents */
+ if (bp->b_flags & B_DELWRI) {
+ qindex = QUEUE_DIRTY;
+ } else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (buf_vm_page_count_severe() &&
+ (bp->b_vflags & BV_BKGRDINPROG) == 0) {
+ /*
+ * We are too low on memory, we have to try to free
+ * the buffer (most importantly: the wired pages
+ * making up its backing store) *now*.
+ */
+ brelse(bp);
+ return;
+ }
+ qindex = QUEUE_CLEAN;
+ }
+ binsfree(bp, qindex);
+
+out:
+ /* unlock */
+ BUF_UNLOCK(bp);
+}
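+
+/*
+ * A sketch of choosing between the two release paths (illustrative;
+ * "keep" is a placeholder for the caller's own policy): keep the
+ * cached contents with bqrelse(), or discard them by marking the
+ * buffer B_INVAL before brelse():
+ *
+ *	if (keep)
+ *		bqrelse(bp);
+ *	else {
+ *		bp->b_flags |= B_INVAL;
+ *		brelse(bp);
+ *	}
+ */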
+
+/* Give pages used by the bp back to the VM system (where possible) */
+static void
+vfs_vmio_release(struct buf *bp)
+{
+ int i;
+ vm_page_t m;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ bp->b_pages[i] = NULL;
+ /*
+ * In order to keep page LRU ordering consistent, put
+ * everything on the inactive queue.
+ */
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+
+ /*
+ * Might as well free the page if we can and it has
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O.
+ */
+ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+ if (m->wire_count == 0 && !vm_page_busied(m))
+ vm_page_free(m);
+ } else if (bp->b_flags & B_DIRECT)
+ vm_page_try_to_free(m);
+ else if (buf_vm_page_count_severe())
+ vm_page_try_to_cache(m);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+
+ if (bp->b_bufsize) {
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_npages = 0;
+ bp->b_flags &= ~B_VMIO;
+ if (bp->b_vp)
+ brelvp(bp);
+}
+
+/*
+ * Check to see if a block at a particular lbn is available for a clustered
+ * write.
+ */
+static int
+vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
+{
+ struct buf *bpa;
+ int match;
+
+ match = 0;
+
+ /* If the buf isn't in core skip it */
+ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
+ return (0);
+
+ /* If the buf is busy we don't want to wait for it */
+ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ return (0);
+
+ /* Only cluster with valid clusterable delayed write buffers */
+ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
+ (B_DELWRI | B_CLUSTEROK))
+ goto done;
+
+ if (bpa->b_bufsize != size)
+ goto done;
+
+ /*
+ * Check to see if it is in the expected place on disk and that the
+ * block has been mapped.
+ */
+ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
+ match = 1;
+done:
+ BUF_UNLOCK(bpa);
+ return (match);
+}
+
+/*
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better than the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
+ */
+int
+vfs_bio_awrite(struct buf *bp)
+{
+ struct bufobj *bo;
+ int i;
+ int j;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int ncl;
+ int nwritten;
+ int size;
+ int maxcl;
+ int gbflags;
+
+ bo = &vp->v_bufobj;
+ gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
+ /*
+ * Right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+ * rather than at the beginning.
+ */
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
+
+ BO_RLOCK(bo);
+ for (i = 1; i < maxcl; i++)
+ if (vfs_bio_clcheck(vp, size, lblkno + i,
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
+ break;
+
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++)
+ if (vfs_bio_clcheck(vp, size, lblkno - j,
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
+ break;
+ BO_RUNLOCK(bo);
+ --j;
+ ncl = i + j;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ BUF_UNLOCK(bp);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+ gbflags);
+ return (nwritten);
+ }
+ }
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ /*
+ * default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
+ */
+ nwritten = bp->b_bufsize;
+ (void) bwrite(bp);
+
+ return (nwritten);
+}
+
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+ KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_kvabase = (caddr_t)addr;
+ } else if ((gbflags & GB_KVAALLOC) != 0) {
+ KASSERT((gbflags & GB_UNMAPPED) != 0,
+ ("GB_KVAALLOC without GB_UNMAPPED"));
+ bp->b_kvaalloc = (caddr_t)addr;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ }
+ bp->b_kvasize = maxsize;
+}
+
+/*
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
+ */
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
+
+ bfreekva(bp);
+ addr = 0;
+
+ if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ atomic_add_int(&bufdefragcnt, 1);
+ return (1);
+ }
+ setbufkva(bp, addr, maxsize, gbflags);
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ return (0);
+}
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+ int defrag)
+{
+ struct thread *td;
+ char *waitmsg;
+ int fl, flags, norunbuf;
+
+ mtx_assert(&bqclean, MA_OWNED);
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+ mtx_lock(&nblock);
+ needsbuffer |= flags;
+ mtx_unlock(&nblock);
+ mtx_unlock(&bqclean);
+
+ bd_speedup(); /* heeeelp */
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
+ td = curthread;
+ mtx_lock(&nblock);
+ while (needsbuffer & flags) {
+ if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+ mtx_unlock(&nblock);
+ /*
+ * getblk() is called with the vnode locked, and
+ * a majority of the dirty buffers may well
+ * belong to that vnode. Flushing those
+ * buffers here makes progress that the
+ * buf_daemon, which cannot lock the vnode,
+ * cannot achieve.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+ /* play bufdaemon */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ mtx_lock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
+ if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
+ waitmsg, slptimeo))
+ break;
+ }
+ mtx_unlock(&nblock);
+}
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+ CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+ "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+ bp->b_kvasize, bp->b_bufsize, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
+ */
+ KASSERT((bp->b_flags & B_DELWRI) == 0,
+ ("delwri buffer %p found in queue %d", bp, qindex));
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * Get the rest of the buffer freed up. b_kva* is still valid
+ * after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 3");
+ KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
+ bp, bp->b_vp, qindex));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
+ ("buf %p still counted as free?", bp));
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+
+ LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+ struct buf *bp, *nbp;
+ int nqindex, qindex, pass;
+
+ KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+ pass = 1;
+restart:
+ atomic_add_int(&getnewbufrestarts, 1);
+
+ /*
+ * Set up for the scan. If we do not have enough free buffers,
+ * we set up a degenerate case that immediately fails. Note
+ * that if we are a specially marked process, we are allowed to
+ * dip into our reserves.
+ *
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * for the allocation of a mapped buffer. For an unmapped buffer,
+ * it is easiest to start with EMPTY outright.
+ *
+ * We start with EMPTYKVA. If the list is empty we back up to EMPTY.
+ * However, there are a number of cases (defragging, reusing, ...)
+ * where we cannot back up.
+ */
+ nbp = NULL;
+ mtx_lock(&bqclean);
+ if (!defrag && unmapped) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+ if (nbp == NULL) {
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ }
+
+ /*
+ * If no EMPTYKVA buffers and we are either defragging or
+ * reusing, locate a CLEAN buffer to free or reuse. If
+ * bufspace usage is low, skip this step so we can allocate a
+ * new buffer.
+ */
+ if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * If we could not find or were not allowed to reuse a CLEAN
+ * buffer, check to see if it is ok to use an EMPTY buffer.
+ * We can only use an EMPTY buffer if allocating its KVA would
+ * not otherwise run us out of buffer space. No KVA is needed
+ * for the unmapped allocation.
+ */
+ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+ metadata)) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+
+ /*
+ * All available buffers might be clean, retry ignoring the
+ * lobufspace as the last resort.
+ */
+ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending.
+ */
+ while ((bp = nbp) != NULL) {
+ qindex = nqindex;
+
+ /*
+ * Calculate next bp (we can only use it if we do not
+ * block or do other fancy things).
+ */
+ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
+ switch (qindex) {
+ case QUEUE_EMPTY:
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ if (nbp != NULL)
+ break;
+ /* FALLTHROUGH */
+ case QUEUE_EMPTYKVA:
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ if (nbp != NULL)
+ break;
+ /* FALLTHROUGH */
+ case QUEUE_CLEAN:
+ if (metadata && pass == 1) {
+ pass = 2;
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(
+ &bufqueues[QUEUE_EMPTY]);
+ }
+ /*
+ * nbp is NULL.
+ */
+ break;
+ }
+ }
+ /*
+ * If we are defragging then we need a buffer with
+ * b_kvasize != 0. XXX this situation should no longer
+ * occur, if defrag is non-zero the buffer's b_kvasize
+ * should also be non-zero at this point. XXX
+ */
+ if (defrag && bp->b_kvasize == 0) {
+ printf("Warning: defrag empty buffer %p\n", bp);
+ continue;
+ }
+
+ /*
+ * Start freeing the bp. This is somewhat involved. nbp
+ * remains valid only for QUEUE_EMPTY[KVA] bp's.
+ */
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (bp->b_vflags & BV_BKGRDINPROG) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+
+ bremfreel(bp);
+ mtx_unlock(&bqclean);
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ */
+
+ getnewbuf_reuse_bp(bp, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * If we are defragging then free the buffer.
+ */
+ if (defrag) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ defrag = 0;
+ goto restart;
+ }
+
+ /*
+ * Notify any waiters for the buffer lock about
+ * identity change by freeing the buffer.
+ */
+ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+
+ if (metadata)
+ break;
+
+ /*
+ * If we are overcommitted then recover the buffer and its
+ * KVM space. This occurs in rare situations when multiple
+ * processes are blocked in getnewbuf() or allocbuf().
+ */
+ if (bufspace >= hibufspace)
+ flushingbufs = 1;
+ if (flushingbufs && bp->b_kvasize != 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+ if (bufspace < lobufspace)
+ flushingbufs = 0;
+ break;
+ }
+ return (bp);
+}
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_arena is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
+{
+ struct buf *bp;
+ int defrag, metadata;
+
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ defrag = 0;
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = 1;
+ else
+ metadata = 0;
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather than sync I/O.
+ */
+ atomic_add_int(&getnewbufcalls, 1);
+ atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+ bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+ GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+ if (bp != NULL)
+ defrag = 0;
+
+ /*
+ * If we exhausted our list, sleep as appropriate. We may have to
+ * wake up various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
+ */
+ if (bp == NULL) {
+ mtx_assert(&bqclean, MA_OWNED);
+ getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+ } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ bfreekva(bp);
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_kvabase = bp->b_data = unmapped_buf;
+ bp->b_kvasize = maxsize;
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else {
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * We finally have a valid bp. We aren't quite out of the
+ * woods, we still have to reserve kva space. In order
+ * to keep fragmentation sane we only allocate kva in
+ * BKVASIZE chunks.
+ */
+ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
+
+ if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+ B_KVAALLOC)) == B_UNMAPPED) {
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ defrag = 1;
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto restart;
+ }
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+ /*
+ * If the reused buffer has KVA allocated,
+ * reassign b_kvaalloc to b_kvabase.
+ */
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_KVAALLOC;
+ atomic_subtract_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+ GB_KVAALLOC)) {
+ /*
+ * The reused buffer already has its KVA
+ * mapped, but the request is for an unmapped
+ * buffer with KVA allocated.
+ */
+ bp->b_kvaalloc = bp->b_kvabase;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ }
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ }
+ }
+ return (bp);
+}
+
+/*
+ * buf_daemon:
+ *
+ * buffer flushing daemon. Buffers are normally flushed by the
+ * update daemon but if it cannot keep up this process starts to
+ * take the load in an attempt to prevent getnewbuf() from blocking.
+ */
+
+static struct kproc_desc buf_kp = {
+ "bufdaemon",
+ buf_daemon,
+ &bufdaemonproc
+};
+SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
+
+static int
+buf_flush(struct vnode *vp, int target)
+{
+ int flushed;
+
+ flushed = flushbufqueues(vp, target, 0);
+ if (flushed == 0) {
+ /*
+ * Could not find any buffers without rollback
+ * dependencies, so just write the first one
+ * in the hopes of eventually making progress.
+ */
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, target, 1);
+ }
+ return (flushed);
+}
+
+static void
+buf_daemon(void)
+{
+ int lodirty;
+
+ /*
+ * This process needs to be suspended prior to shutdown sync.
+ */
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
+ SHUTDOWN_PRI_LAST);
+
+ /*
+ * This process is allowed to take the buffer cache to the limit
+ */
+ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
+ mtx_lock(&bdlock);
+ for (;;) {
+ bd_request = 0;
+ mtx_unlock(&bdlock);
+
+ kproc_suspend_check(bufdaemonproc);
+ lodirty = lodirtybuffers;
+ if (bd_speedupreq) {
+ lodirty = numdirtybuffers / 2;
+ bd_speedupreq = 0;
+ }
+ /*
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system.
+ */
+ while (numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
+ break;
+ kern_yield(PRI_USER);
+ }
+
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 1 second and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
+ */
+ mtx_lock(&bdlock);
+ if (numdirtybuffers <= lodirtybuffers) {
+ /*
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
+ */
+ bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
+ msleep(&bd_request, &bdlock, PVM, "psleep", hz);
+ } else {
+ /*
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
+ */
+ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
+ }
+ }
+}
+
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of writing them, which NFS is
+ * particularly sensitive to.
+ */
+static int flushwithdeps = 0;
+SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
+ 0, "Number of buffers flushed with dependecies that require rollbacks");
+
+static int
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
+{
+ struct buf *sentinel;
+ struct vnode *vp;
+ struct mount *mp;
+ struct buf *bp;
+ int hasdeps;
+ int flushed;
+ int queue;
+
+ flushed = 0;
+ queue = QUEUE_DIRTY;
+ bp = NULL;
+ sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
+ sentinel->b_qindex = QUEUE_SENTINEL;
+ mtx_lock(&bqdirty);
+ TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
+ while (flushed != target) {
+ bp = TAILQ_NEXT(sentinel, b_freelist);
+ if (bp != NULL) {
+ TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
+ TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
+ b_freelist);
+ } else
+ break;
+ /*
+ * Skip sentinels inserted by other invocations of
+ * flushbufqueues(), taking care not to reorder them.
+ */
+ if (bp->b_qindex == QUEUE_SENTINEL)
+ continue;
+ /*
+ * Only flush the buffers that belong to the
+ * vnode locked by the curthread.
+ */
+ if (lvp != NULL && bp->b_vp != lvp)
+ continue;
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+ if (bp->b_pin_count > 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
+ (bp->b_flags & B_DELWRI) == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (bp->b_flags & B_INVAL) {
+ bremfreel(bp);
+ mtx_unlock(&bqdirty);
+ brelse(bp);
+ flushed++;
+ mtx_lock(&bqdirty);
+ continue;
+ }
+
+ if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
+ if (flushdeps == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ hasdeps = 1;
+ } else
+ hasdeps = 0;
+ /*
+ * We must hold the lock on a vnode before writing
+ * one of its buffers. Otherwise we may confuse, or
+ * in the case of a snapshot vnode, deadlock the
+ * system.
+ *
+ * The lock order here is the reverse of the normal
+ * order of vnode lock followed by buf lock. This is
+ * ok because the NOWAIT will prevent deadlock.
+ */
+ vp = bp->b_vp;
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
+ mtx_unlock(&bqdirty);
+ CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ if (curproc == bufdaemonproc)
+ vfs_bio_awrite(bp);
+ else {
+ bremfree(bp);
+ bwrite(bp);
+ notbufdflushes++;
+ }
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ flushwithdeps += hasdeps;
+ flushed++;
+
+ /*
+ * Sleeping on runningbufspace while holding
+ * vnode lock leads to deadlock.
+ */
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
+ waitrunningbufspace();
+ mtx_lock(&bqdirty);
+ continue;
+ }
+ vn_finished_write(mp);
+ BUF_UNLOCK(bp);
+ }
+ TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
+ mtx_unlock(&bqdirty);
+ free(sentinel, M_TEMP);
+ return (flushed);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+incore(struct bufobj *bo, daddr_t blkno)
+{
+ struct buf *bp;
+
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ BO_RUNLOCK(bo);
+ return (bp);
+}
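+
+/*
+ * A sketch of the cheap existence check incore() provides
+ * (illustrative); the returned buffer is not locked, so the result is
+ * only a hint:
+ *
+ *	if (incore(&vp->v_bufobj, lbn) != NULL)
+ *		(the block is at least partially cached)
+ */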
+
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
+ */
+
+static int
+inmem(struct vnode * vp, daddr_t blkno)
+{
+ vm_object_t obj;
+ vm_offset_t toff, tinc, size;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ ASSERT_VOP_LOCKED(vp, "inmem");
+
+ if (incore(&vp->v_bufobj, blkno))
+ return 1;
+ if (vp->v_mount == NULL)
+ return 0;
+ obj = vp->v_object;
+ if (obj == NULL)
+ return (0);
+
+ size = PAGE_SIZE;
+ if (size > vp->v_mount->mnt_stat.f_iosize)
+ size = vp->v_mount->mnt_stat.f_iosize;
+ off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+
+ VM_OBJECT_RLOCK(obj);
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ goto notinmem;
+ tinc = size;
+ if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
+ tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
+ if (vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
+ goto notinmem;
+ }
+ VM_OBJECT_RUNLOCK(obj);
+ return 1;
+
+notinmem:
+ VM_OBJECT_RUNLOCK(obj);
+ return (0);
+}
+
+/*
+ * Set the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer. The range is limited
+ * to the size of the buffer.
+ *
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
+ */
+static void
+vfs_clean_pages_dirty_buf(struct buf *bp)
+{
+ vm_ooffset_t foff, noff, eoff;
+ vm_page_t m;
+ int i;
+
+ if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
+ return;
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_clean_pages_dirty_buf: no buffer offset"));
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(bp);
+ vfs_setdirty_locked_object(bp);
+ for (i = 0; i < bp->b_npages; i++) {
+ noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ eoff = noff;
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
+ m = bp->b_pages[i];
+ vfs_page_set_validclean(bp, foff, m);
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+static void
+vfs_setdirty_locked_object(struct buf *bp)
+{
+ vm_object_t object;
+ int i;
+
+ object = bp->b_bufobj->bo_object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet.
+ */
+ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++)
+ vm_page_test_dirty(bp->b_pages[i]);
+
+ /*
+ * Calculate the encompassing dirty range, boffset and eoffset,
+ * (eoffset - boffset) bytes.
+ */
+
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty)
+ break;
+ }
+ boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
+ }
+}
+
+/*
+ * Allocate the KVA mapping for an existing buffer. It handles both
+ * the case of a B_UNMAPPED buffer and that of a buffer whose KVA was
+ * preallocated but never mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ struct buf *scratch_bp;
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ bp->b_flags &= ~B_KVAALLOC;
+ KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+ bp->b_kvabase = bp->b_kvaalloc;
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ /*
+ * Request defragmentation. getnewbuf() returns us the
+ * allocated space by the scratch buffer KVA.
+ */
+ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+ (GB_UNMAPPED | GB_KVAALLOC));
+ if (scratch_bp == NULL) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+ }
+ atomic_add_int(&mappingrestarts, 1);
+ goto mapping_loop;
+ }
+ KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+ ("scratch bp !B_KVAALLOC %p", scratch_bp));
+ setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+ scratch_bp->b_kvasize, gbflags);
+
+ /* Get rid of the scratch buffer. */
+ scratch_bp->b_kvasize = 0;
+ scratch_bp->b_flags |= B_INVAL;
+ scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ brelse(scratch_bp);
+ }
+ if (!need_mapping)
+ return;
+
+has_addr:
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+}
+
+/*
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffers B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a bwrite() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
+ */
+struct buf *
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+ int flags)
+{
+ struct buf *bp;
+ struct bufobj *bo;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
+
+ CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ ASSERT_VOP_LOCKED(vp, "getblk");
+ if (size > MAXBSIZE)
+ panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ bo = &vp->v_bufobj;
+loop:
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ if (bp != NULL) {
+ int lockflags;
+ /*
+ * Buffer is in-core. If the buffer is neither busy nor managed,
+ * it must be on a queue.
+ */
+ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
+
+ if (flags & GB_LOCK_NOWAIT)
+ lockflags |= LK_NOWAIT;
+
+ error = BUF_TIMELOCK(bp, lockflags,
+ BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
+
+ /*
+ * If we slept and got the lock we have to restart in case
+ * the buffer changed identities.
+ */
+ if (error == ENOLCK)
+ goto loop;
+ /* We timed out or were interrupted. */
+ else if (error)
+ return (NULL);
+ /* If recursed, assume caller knows the rules. */
+ else if (BUF_LOCKRECURSED(bp))
+ goto end;
+
+ /*
+ * The buffer is locked. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
+ if (bp->b_flags & B_MANAGED)
+ MPASS(bp->b_qindex == QUEUE_NONE);
+ else
+ bremfree(bp);
+
+ /*
+ * Check for size inconsistencies for the non-VMIO case.
+ */
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)) {
+ if (bp->b_flags & B_DELWRI) {
+ /*
+ * If the buffer is pinned and the caller does
+ * not want to sleep waiting for it to be
+ * unpinned, bail out.
+ */
+ if (bp->b_pin_count > 0) {
+ if (flags & GB_LOCK_NOWAIT) {
+ bqrelse(bp);
+ return (NULL);
+ } else {
+ bunpin_wait(bp);
+ }
+ }
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ } else {
+ if (LIST_EMPTY(&bp->b_dep)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ }
+ }
+ goto loop;
+ }
+ }
+
+ /*
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
+ * If the size is inconsistent in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
+ */
+ if (bp->b_bcount != size)
+ allocbuf(bp, size);
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("getblk: no buffer offset"));
+
+ /*
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
+ *
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ goto loop;
+ }
+ bp->b_flags &= ~B_DONE;
+ } else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is locked. Note that the returned
+ * buffer is also considered valid (not marked B_INVAL).
+ */
+ BO_RUNLOCK(bo);
+ /*
+ * If the user does not want us to create the buffer, bail out
+ * here.
+ */
+ if (flags & GB_NOCREAT)
+ return NULL;
+ if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+ return NULL;
+
+ bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
+ offset = blkno * bsize;
+ vmio = vp->v_object != NULL;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+ /* Do not allow non-VMIO unmapped buffers. */
+ flags &= ~GB_UNMAPPED;
+ }
+ maxsize = imax(maxsize, bsize);
+
+ bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
+ if (bp == NULL) {
+ if (slpflag || slptimeo)
+ return NULL;
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created.
+ *
+ * Note: this must occur before we associate the buffer
+ * with the vp, especially considering limitations in
+ * the splay tree implementation when dealing with duplicate
+ * lblkno's.
+ */
+ BO_LOCK(bo);
+ if (gbincore(bo, blkno)) {
+ BO_UNLOCK(bo);
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bp->b_offset = offset;
+ bgetvp(vp, bp);
+ BO_UNLOCK(bo);
+
+ /*
+		 * Set the B_VMIO bit. allocbuf() will grow the buffer. Since
+		 * the buffer size starts out as 0, B_CACHE will be set by
+		 * allocbuf() for the VMIO case prior to it testing the
+		 * backing store for validity.
+ */
+
+ if (vmio) {
+ bp->b_flags |= B_VMIO;
+ KASSERT(vp->v_object == bp->b_bufobj->bo_object,
+ ("ARGH! different b_bufobj->bo_object %p %p %p\n",
+ bp, vp->v_object, bp->b_bufobj->bo_object));
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ KASSERT(bp->b_bufobj->bo_object == NULL,
+ ("ARGH! has b_bufobj->bo_object %p %p\n",
+ bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
+ }
+
+ allocbuf(bp, size);
+ bp->b_flags &= ~B_DONE;
+ }
+ CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
+ BUF_ASSERT_HELD(bp);
+end:
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ return (bp);
+}
+
+/*
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
+ */
+struct buf *
+geteblk(int size, int flags)
+{
+ struct buf *bp;
+ int maxsize;
+
+ maxsize = (size + BKVAMASK) & ~BKVAMASK;
+ while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+ if ((flags & GB_NOWAIT_BD) &&
+ (curthread->td_pflags & TDP_BUFNEED) != 0)
+ return (NULL);
+ }
+ allocbuf(bp, size);
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ BUF_ASSERT_HELD(bp);
+ return (bp);
+}
+
+
+/*
+ * This code constitutes the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
+ */
+
+int
+allocbuf(struct buf *bp, int size)
+{
+ int newbsize, mbsize;
+ int i;
+
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+
+ if ((bp->b_flags & B_VMIO) == 0) {
+ caddr_t origbuf;
+ int origbufsize;
+ /*
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
+ */
+ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ if (bp->b_flags & B_MALLOC)
+ newbsize = mbsize;
+ else
+ newbsize = round_page(size);
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (bp->b_flags & B_MALLOC) {
+ if (newbsize) {
+ bp->b_bcount = size;
+ } else {
+ free(bp->b_data, M_BIOBUF);
+ if (bp->b_bufsize) {
+ atomic_subtract_long(
+ &bufmallocspace,
+ bp->b_bufsize);
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_bcount = 0;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return 1;
+ }
+ vm_hold_free_pages(bp, newbsize);
+ } else if (newbsize > bp->b_bufsize) {
+ /*
+			 * We only use malloced memory on the first allocation,
+			 * and revert to page-allocated memory when the buffer
+			 * grows.
+ */
+ /*
+ * There is a potential smp race here that could lead
+ * to bufmallocspace slightly passing the max. It
+ * is probably extremely rare and not worth worrying
+ * over.
+ */
+ if ( (bufmallocspace < maxbufmallocspace) &&
+ (bp->b_bufsize == 0) &&
+ (mbsize <= PAGE_SIZE/2)) {
+
+ bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
+ bp->b_bufsize = mbsize;
+ bp->b_bcount = size;
+ bp->b_flags |= B_MALLOC;
+ atomic_add_long(&bufmallocspace, mbsize);
+ return 1;
+ }
+ origbuf = NULL;
+ origbufsize = 0;
+ /*
+ * If the buffer is growing on its other-than-first allocation,
+ * then we revert to the page-allocation scheme.
+ */
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ if (bp->b_bufsize) {
+ atomic_subtract_long(&bufmallocspace,
+ bp->b_bufsize);
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+ vm_hold_load_pages(
+ bp,
+ (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+ if (origbuf) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+ }
+ } else {
+ int desiredpages;
+
+ newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ desiredpages = (size == 0) ? 0 :
+ num_pages((bp->b_offset & PAGE_MASK) + newbsize);
+
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+			 * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
+ if (desiredpages < bp->b_npages) {
+ vm_page_t m;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ /*
+ * the page is not freed here -- it
+ * is the responsibility of
+ * vnode_pager_setsize
+ */
+ m = bp->b_pages[i];
+ KASSERT(m != bogus_page,
+ ("allocbuf: bogus page found"));
+ while (vm_page_sleep_if_busy(m,
+ "biodep"))
+ continue;
+
+ bp->b_pages[i] = NULL;
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ bp->b_npages = desiredpages;
+ }
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
+
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+
+ obj = bp->b_bufobj->bo_object;
+
+ VM_OBJECT_WLOCK(obj);
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+
+ /*
+ * We must allocate system pages since blocking
+ * here could interfere with paging I/O, no
+ * matter which process we are.
+ *
+ * Only exclusive busy can be tested here.
+ * Blocking on shared busy might lead to
+ * deadlocks once allocbuf() is called after
+ * pages are vfs_busy_pages().
+ */
+ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
+ bp->b_npages, VM_ALLOC_NOBUSY |
+ VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
+ VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_COUNT(desiredpages - bp->b_npages));
+ if (m->valid == 0)
+ bp->b_flags &= ~B_CACHE;
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
+ }
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+			 * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Step 3, fixup the KVM pmap.
+ */
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
+ }
+ }
+ if (newbsize < bp->b_bufsize)
+ bufspacewakeup();
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
+ return 1;
+}
+
+extern int inflight_transient_maps;
+
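+/*
+ * Complete a bio request: mark BIO_DONE under the per-bio pool mutex, invoke
+ * the completion callback if one is set (otherwise wake up a biowait()
+ * sleeper), and tear down any transient KVA mapping used for unmapped I/O.
+ */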
+void
+biodone(struct bio *bp)
+{
+ struct mtx *mtxp;
+ void (*done)(struct bio *);
+ vm_offset_t start, end;
+ int transient;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->bio_flags |= BIO_DONE;
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ transient = 1;
+ } else {
+ transient = 0;
+ start = end = 0;
+ }
+ done = bp->bio_done;
+ if (done == NULL)
+ wakeup(bp);
+ mtx_unlock(mtxp);
+ if (done != NULL)
+ done(bp);
+ if (transient) {
+ pmap_qremove(start, OFF_TO_IDX(end - start));
+ vmem_free(transient_arena, start, end - start);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
+}
+
+/*
+ * Wait for a BIO to finish.
+ *
+ * XXX: resort to a timeout for now. The optimal locking (if any) for this
+ * case is not yet clear.
+ */
+int
+biowait(struct bio *bp, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->bio_flags & BIO_DONE) == 0)
+ msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
+ mtx_unlock(mtxp);
+ if (bp->bio_error != 0)
+ return (bp->bio_error);
+ if (!(bp->bio_flags & BIO_ERROR))
+ return (0);
+ return (EIO);
+}
+
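+/*
+ * Record any error on the bio, finish the devstat transaction if a devstat
+ * was supplied, and complete the bio via biodone().
+ */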
+void
+biofinish(struct bio *bp, struct devstat *stat, int error)
+{
+
+ if (error) {
+ bp->bio_error = error;
+ bp->bio_flags |= BIO_ERROR;
+ }
+ if (stat != NULL)
+ devstat_end_transaction_bio(stat, bp);
+ biodone(bp);
+}
+
+/*
+ * bufwait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left locked and B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
+ */
+int
+bufwait(struct buf *bp)
+{
+ if (bp->b_iocmd == BIO_READ)
+ bwait(bp, PRIBIO, "biord");
+ else
+ bwait(bp, PRIBIO, "biowr");
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_ioflags & BIO_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Call back function from struct bio back up to struct buf.
+ */
+static void
+bufdonebio(struct bio *bip)
+{
+ struct buf *bp;
+
+ bp = bip->bio_caller2;
+ bp->b_resid = bp->b_bcount - bip->bio_completed;
+ bp->b_resid = bip->bio_resid; /* XXX: remove */
+ bp->b_ioflags = bip->bio_flags;
+ bp->b_error = bip->bio_error;
+ if (bp->b_error)
+ bp->b_ioflags |= BIO_ERROR;
+ bufdone(bp);
+ g_destroy_bio(bip);
+}
+
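+/*
+ * Send a buffer down to a character device: look up the device's cdevsw with
+ * a thread reference held, hand the buffer to dev_strategy_csw(), and drop
+ * the reference afterwards.
+ */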
+void
+dev_strategy(struct cdev *dev, struct buf *bp)
+{
+ struct cdevsw *csw;
+ int ref;
+
+ KASSERT(dev->si_refcount > 0,
+ ("dev_strategy on un-referenced struct cdev *(%s) %p",
+ devtoname(dev), dev));
+
+ csw = dev_refthread(dev, &ref);
+ dev_strategy_csw(dev, csw, bp);
+ dev_relthread(dev, ref);
+}
+
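+/*
+ * Wrap a struct buf in a struct bio and pass it to the device's d_strategy
+ * routine. If no cdevsw is available, the buffer is failed with ENXIO.
+ */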
+void
+dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
+{
+ struct bio *bip;
+
+ KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
+ ("b_iocmd botch"));
+ KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
+ dev->si_threadcount > 0,
+ ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
+ dev));
+ if (csw == NULL) {
+ bp->b_error = ENXIO;
+ bp->b_ioflags = BIO_ERROR;
+ bufdone(bp);
+ return;
+ }
+ for (;;) {
+ bip = g_new_bio();
+ if (bip != NULL)
+ break;
+ /* Try again later */
+ tsleep(&bp, PRIBIO, "dev_strat", hz/10);
+ }
+ bip->bio_cmd = bp->b_iocmd;
+ bip->bio_offset = bp->b_iooffset;
+ bip->bio_length = bp->b_bcount;
+ bip->bio_bcount = bp->b_bcount; /* XXX: remove */
+ bdata2bio(bp, bip);
+ bip->bio_done = bufdonebio;
+ bip->bio_caller2 = bp;
+ bip->bio_dev = dev;
+ (*csw->d_strategy)(bip);
+}
+
+/*
+ * bufdone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ *	read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ *	initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
+ */
+void
+bufdone(struct buf *bp)
+{
+ struct bufobj *dropobj;
+ void (*biodone)(struct buf *);
+
+ CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ dropobj = NULL;
+
+ KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
+ BUF_ASSERT_HELD(bp);
+
+ runningbufwakeup(bp);
+ if (bp->b_iocmd == BIO_WRITE)
+ dropobj = bp->b_bufobj;
+ /* call optional completion function if requested */
+ if (bp->b_iodone != NULL) {
+ biodone = bp->b_iodone;
+ bp->b_iodone = NULL;
+ (*biodone) (bp);
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+ return;
+ }
+
+ bufdone_finish(bp);
+
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
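+/*
+ * Second half of bufdone(): update the VM pages backing a B_VMIO buffer
+ * (validity, busy state, bogus-page replacement), then release the buffer
+ * (async case) or wake up its waiter (sync case).
+ */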
+void
+bufdone_finish(struct buf *bp)
+{
+ BUF_ASSERT_HELD(bp);
+
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_complete(bp);
+
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ struct vnode *vp;
+ int bogus, i, iosize;
+
+ obj = bp->b_bufobj->bo_object;
+ KASSERT(obj->paging_in_progress >= bp->b_npages,
+ ("biodone_finish: paging in progress(%d) < b_npages(%d)",
+ obj->paging_in_progress, bp->b_npages));
+
+ vp = bp->b_vp;
+ KASSERT(vp->v_holdcnt > 0,
+ ("biodone_finish: vnode %p has zero hold count", vp));
+ KASSERT(vp->v_object != NULL,
+ ("biodone_finish: vnode %p has no vm_object", vp));
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("biodone_finish: bp %p has no buffer offset", bp));
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+		 * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount - bp->b_resid;
+ if (bp->b_iocmd == BIO_READ &&
+ !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
+ !(bp->b_ioflags & BIO_ERROR)) {
+ bp->b_flags |= B_CACHE;
+ }
+ bogus = 0;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ int bogusflag = 0;
+ int resid;
+
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogus = bogusflag = 1;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (m == NULL)
+ panic("biodone: page disappeared!");
+ bp->b_pages[i] = m;
+ }
+ KASSERT(OFF_TO_IDX(foff) == m->pindex,
+ ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
+ (intmax_t)foff, (uintmax_t)m->pindex));
+
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
+ */
+ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
+ KASSERT((m->dirty & vm_page_bits(foff &
+ PAGE_MASK, resid)) == 0, ("bufdone_finish:"
+ " page %p has unexpected dirty bits", m));
+ vfs_page_set_valid(bp, foff, m);
+ }
+
+ vm_page_sunbusy(m);
+ vm_object_pip_subtract(obj, 1);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ iosize -= resid;
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+ }
+
+ /*
+ * For asynchronous completions, release the buffer now. The brelse
+ * will do a wakeup there if necessary - so no need to do a wakeup
+ * here in the async case. The sync case always needs to do a wakeup.
+ */
+
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
+ brelse(bp);
+ else
+ bqrelse(bp);
+ } else
+ bdone(bp);
+}
+
+/*
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
+ */
+void
+vfs_unbusy_pages(struct buf *bp)
+{
+ int i;
+ vm_object_t obj;
+ vm_page_t m;
+
+ runningbufwakeup(bp);
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
+ if (!m)
+ panic("vfs_unbusy_pages: page missing\n");
+ bp->b_pages[i] = m;
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ }
+ vm_object_pip_subtract(obj, 1);
+ vm_page_sunbusy(m);
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ VM_OBJECT_WUNLOCK(obj);
+}
+
+/*
+ * vfs_page_set_valid:
+ *
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
+ *
+ * This routine is typically called after a read completes.
+ */
+static void
+vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t eoff;
+
+ /*
+ * Compute the end offset, eoff, such that [off, eoff) does not span a
+ * page boundary and eoff is not greater than the end of the buffer.
+ * The end of the buffer, in this case, is our file EOF, not the
+ * allocation size of the buffer.
+ */
+ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > off)
+ vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
+}
+
+/*
+ * vfs_page_set_validclean:
+ *
+ * Set the valid bits and clear the dirty bits in a page based on the
+ * supplied offset. The range is restricted to the buffer's size.
+ */
+static void
+vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t soff, eoff;
+
+ /*
+ * Start and end offsets in buffer. eoff - soff may not cross a
+	 * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
+ */
+ soff = off;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
+}
+
+/*
+ * Ensure that all buffer pages are not exclusive busied. If any page is
+ * exclusive busy, drain it.
+ */
+void
+vfs_drain_busy_pages(struct buf *bp)
+{
+ vm_page_t m;
+ int i, last_busied;
+
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
+ last_busied = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (vm_page_xbusied(m)) {
+ for (; last_busied < i; last_busied++)
+ vm_page_sbusy(bp->b_pages[last_busied]);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ vm_page_busy_sleep(m, "vbpage");
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ }
+ }
+ }
+ for (i = 0; i < last_busied; i++)
+ vm_page_sunbusy(bp->b_pages[i]);
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being exclusive busy. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
+ */
+void
+vfs_busy_pages(struct buf *bp, int clear_modify)
+{
+ int i, bogus;
+ vm_object_t obj;
+ vm_ooffset_t foff;
+ vm_page_t m;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_busy_pages: no buffer offset"));
+ VM_OBJECT_WLOCK(obj);
+ vfs_drain_busy_pages(bp);
+ if (bp->b_bufsize != 0)
+ vfs_setdirty_locked_object(bp);
+ bogus = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vm_object_pip_add(obj, 1);
+ vm_page_sbusy(m);
+ }
+ /*
+		 * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+ if (clear_modify) {
+ pmap_remove_write(m);
+ vfs_page_set_validclean(bp, foff, m);
+ } else if (m->valid == VM_PAGE_BITS_ALL &&
+ (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ bogus++;
+ }
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * vfs_bio_set_valid:
+ *
+ * Set the range within the buffer to valid. The range is
+ * relative to the beginning of the buffer, b_offset. Note that
+ * b_offset itself may be offset from the beginning of the first
+ * page.
+ */
+void
+vfs_bio_set_valid(struct buf *bp, int base, int size)
+{
+ int i, n;
+ vm_page_t m;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ vm_page_set_valid_range(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * If the specified buffer is a non-VMIO buffer, clear the entire
+ * buffer. If the specified buffer is a VMIO buffer, clear and
+ * validate only the previously invalid portions of the buffer.
+ * This routine essentially fakes an I/O, so we need to clear
+ * BIO_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+void
+vfs_bio_clrbuf(struct buf *bp)
+{
+ int i, j, mask, sa, ea, slide;
+
+ if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
+ clrbuf(bp);
+ return;
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+ (bp->b_offset & PAGE_MASK) == 0) {
+ if (bp->b_pages[0] == bogus_page)
+ goto unlock;
+ mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
+ if ((bp->b_pages[0]->valid & mask) == mask)
+ goto unlock;
+ if ((bp->b_pages[0]->valid & mask) == 0) {
+ pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
+ bp->b_pages[0]->valid |= mask;
+ goto unlock;
+ }
+ }
+ sa = bp->b_offset & PAGE_MASK;
+ slide = 0;
+ for (i = 0; i < bp->b_npages; i++, sa = 0) {
+ slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
+ ea = slide & PAGE_MASK;
+ if (ea == 0)
+ ea = PAGE_SIZE;
+ if (bp->b_pages[i] == bogus_page)
+ continue;
+ j = sa / DEV_BSIZE;
+ mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
+ if ((bp->b_pages[i]->valid & mask) == mask)
+ continue;
+ if ((bp->b_pages[i]->valid & mask) == 0)
+ pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
+ else {
+ for (; sa < ea; sa += DEV_BSIZE, j++) {
+ if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
+ pmap_zero_page_area(bp->b_pages[i],
+ sa, DEV_BSIZE);
+ }
+ }
+ }
+ bp->b_pages[i]->valid |= mask;
+ }
+unlock:
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ bp->b_resid = 0;
+}
+
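+/*
+ * Zero a byte range of the buffer: bzero() the mapped data for mapped
+ * buffers, or zero the backing pages directly for unmapped ones.
+ */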
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+ vm_page_t m;
+ int i, n;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ bzero(bp->b_data + base, size);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ pmap_zero_page_area(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into
+ * a buffer's address space. The pages are anonymous and are
+ * not associated with a file object.
+ */
+static void
+vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ BUF_CHECK_MAPPED(bp);
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+tryagain:
+ /*
+ * note: must allocate system pages since blocking here
+ * could interfere with paging I/O, no matter which
+ * process we are.
+ */
+ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
+ if (p == NULL) {
+ VM_WAIT;
+ goto tryagain;
+ }
+ pmap_qenter(pg, &p, 1);
+ bp->b_pages[index] = p;
+ }
+ bp->b_npages = index;
+}
+
+/* Return pages associated with this buf to the vm system */
+static void
+vm_hold_free_pages(struct buf *bp, int newbsize)
+{
+ vm_offset_t from;
+ vm_page_t p;
+ int index, newnpages;
+
+ BUF_CHECK_MAPPED(bp);
+
+ from = round_page((vm_offset_t)bp->b_data + newbsize);
+ newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ if (bp->b_npages > newnpages)
+ pmap_qremove(from, bp->b_npages - newnpages);
+ for (index = newnpages; index < bp->b_npages; index++) {
+ p = bp->b_pages[index];
+ bp->b_pages[index] = NULL;
+ if (vm_page_sbusied(p))
+ printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
+ (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
+ p->wire_count--;
+ vm_page_free(p);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ }
+ bp->b_npages = newnpages;
+}
+
+/*
+ * Map an IO request into kernel virtual address space.
+ *
+ * All requests are (re)mapped into kernel VA space.
+ * Notice that we use b_bufsize for the size of the buffer
+ * to be mapped. b_bcount might be modified by the driver.
+ *
+ * Note that even if the caller determines that the address space should
+ * be valid, a race or a smaller file mapped into a larger space may
+ * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
+ * check the return value.
+ */
+int
+vmapbuf(struct buf *bp, int mapbuf)
+{
+ caddr_t kva;
+ vm_prot_t prot;
+ int pidx;
+
+ if (bp->b_bufsize < 0)
+ return (-1);
+ prot = VM_PROT_READ;
+ if (bp->b_iocmd == BIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
+ btoc(MAXPHYS))) < 0)
+ return (-1);
+ bp->b_npages = pidx;
+ if (mapbuf || !unmapped_buf_allowed) {
+ pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
+ kva = bp->b_saveaddr;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
+ bp->b_flags &= ~B_UNMAPPED;
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = unmapped_buf;
+ }
+ return(0);
+}
+
+/*
+ * Free the io map PTEs associated with this IO operation.
+ * We also invalidate the TLB entries and restore the original b_addr.
+ */
+void
+vunmapbuf(struct buf *bp)
+{
+ int npages;
+
+ npages = bp->b_npages;
+ if (bp->b_flags & B_UNMAPPED)
+ bp->b_flags &= ~B_UNMAPPED;
+ else
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+ vm_page_unhold_pages(bp->b_pages, npages);
+
+ bp->b_data = bp->b_saveaddr;
+}
+
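+/*
+ * Mark the buffer done and wake up any thread sleeping on it in bwait().
+ */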
+void
+bdone(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->b_flags |= B_DONE;
+ wakeup(bp);
+ mtx_unlock(mtxp);
+}
+
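+/*
+ * Sleep until bdone() sets B_DONE on the buffer.
+ */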
+void
+bwait(struct buf *bp, u_char pri, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->b_flags & B_DONE) == 0)
+ msleep(bp, mtxp, pri, wchan, 0);
+ mtx_unlock(mtxp);
+}
+
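+/* Default bufobj sync method: fsync the vnode backing the buffer object. */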
+int
+bufsync(struct bufobj *bo, int waitfor)
+{
+
+ return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
+}
+
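+/*
+ * Default bufobj strategy method: pass the buffer to VOP_STRATEGY() on the
+ * associated vnode.
+ */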
+void
+bufstrategy(struct bufobj *bo, struct buf *bp)
+{
+ int i = 0;
+ struct vnode *vp;
+
+ vp = bp->b_vp;
+ KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
+ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
+ ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
+ i = VOP_STRATEGY(vp, bp);
+ KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
+}
+
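+/*
+ * Account for an output (write) in flight on the buffer object; the caller
+ * already holds the bufobj lock.
+ */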
+void
+bufobj_wrefl(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ ASSERT_BO_WLOCKED(bo);
+ bo->bo_numoutput++;
+}
+
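+/*
+ * Account for an output in flight, acquiring the bufobj lock internally.
+ */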
+void
+bufobj_wref(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ BO_LOCK(bo);
+ bo->bo_numoutput++;
+ BO_UNLOCK(bo);
+}
+
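+/*
+ * Drop one in-flight output; wake up bufobj_wwait() sleepers when the count
+ * reaches zero.
+ */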
+void
+bufobj_wdrop(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
+ BO_LOCK(bo);
+ KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
+ if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
+ bo->bo_flag &= ~BO_WWAIT;
+ wakeup(&bo->bo_numoutput);
+ }
+ BO_UNLOCK(bo);
+}
+
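+/*
+ * Wait until all outputs in flight on the buffer object have completed.
+ */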
+int
+bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
+{
+ int error;
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
+ ASSERT_BO_WLOCKED(bo);
+ error = 0;
+ while (bo->bo_numoutput) {
+ bo->bo_flag |= BO_WWAIT;
+ error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
+ slpflag | (PRIBIO + 1), "bo_wwait", timeo);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
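+/*
+ * Buffer pinning: bpin() and bunpin() adjust b_pin_count under the buffer's
+ * pool mutex, and bunpin_wait() sleeps until the pin count drops to zero.
+ */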
+void
+bpin(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->b_pin_count++;
+ mtx_unlock(mtxp);
+}
+
+void
+bunpin(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ if (--bp->b_pin_count == 0)
+ wakeup(bp);
+ mtx_unlock(mtxp);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while (bp->b_pin_count > 0)
+ msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
+ mtx_unlock(mtxp);
+}
+
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
+ (long long)bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+/* DDB command to show buffer data */
+DB_SHOW_COMMAND(buffer, db_show_buffer)
+{
+ /* get args */
+ struct buf *bp = (struct buf *)addr;
+
+ if (!have_addr) {
+ db_printf("usage: show buffer <addr>\n");
+ return;
+ }
+
+ db_printf("buf at %p\n", bp);
+ db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
+ (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
+ PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
+ db_printf(
+ "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
+ "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
+ "b_dep = %p\n",
+ bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
+ bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
+ if (bp->b_npages) {
+ int i;
+ db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m;
+ m = bp->b_pages[i];
+ db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
+ (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
+ if ((i + 1) < bp->b_npages)
+ db_printf(",");
+ }
+ db_printf("\n");
+ }
+ db_printf(" ");
+ BUF_LOCKPRINTINFO(bp);
+}
+
+DB_SHOW_COMMAND(lockedbufs, lockedbufs)
+{
+ struct buf *bp;
+ int i;
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if (BUF_ISLOCKED(bp)) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+ }
+}
+
+DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
+{
+ struct vnode *vp;
+ struct buf *bp;
+
+ if (!have_addr) {
+ db_printf("usage: show vnodebufs <addr>\n");
+ return;
+ }
+ vp = (struct vnode *)addr;
+ db_printf("Clean buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+ db_printf("Dirty buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+}
+
+DB_COMMAND(countfreebufs, db_coundfreebufs)
+{
+ struct buf *bp;
+ int i, used = 0, nfree = 0;
+
+ if (have_addr) {
+ db_printf("usage: countfreebufs\n");
+ return;
+ }
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if ((bp->b_flags & B_INFREECNT) != 0)
+ nfree++;
+ else
+ used++;
+ }
+
+ db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
+ nfree + used);
+ db_printf("numfreebuffers is %d\n", numfreebuffers);
+}
+#endif /* DDB */
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
new file mode 100644
index 0000000..31ed545
--- /dev/null
+++ b/sys/kern/vfs_cache.c
@@ -0,0 +1,1486 @@
+/*-
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Poul-Henning Kamp of the FreeBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/uma.h>
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
+ "char *", "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
+ "char *");
+
+/*
+ * This structure describes the elements in the cache of recent
+ * names looked up by namei.
+ */
+
+struct namecache {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ struct vnode *nc_vp; /* vnode the name refers to */
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ char nc_name[0]; /* segment name + nul */
+};
+
+/*
+ * struct namecache_ts repeats struct namecache layout up to the
+ * nc_nlen member.
+ * struct namecache_ts is used in place of struct namecache when time(s) need
+ * to be stored. The nc_dotdottime field is used when a cache entry is mapping
+ * to be stored. The nc_dotdottime field is used when a cache entry maps
+ * both a non-dotdot directory name and dotdot for the directory's
+ */
+struct namecache_ts {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ struct vnode *nc_vp; /* vnode the name refers to */
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ struct timespec nc_time; /* timespec provided by fs */
+ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
+ int nc_ticks; /* ticks value when entry was added */
+ char nc_name[0]; /* segment name + nul */
+};
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE 0x01
+#define NCF_ISDOTDOT 0x02
+#define NCF_TS 0x04
+#define NCF_DTS 0x08
+
+/*
+ * Name caching works as follows:
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference. It is managed LRU, so frequently
+ * used names will hang around. Cache is indexed by hash value
+ * obtained from (vp, name) where vp refers to the directory
+ * containing name.
+ *
+ * If it is a "negative" entry, (i.e. for a name that is known NOT to
+ * exist) the vnode pointer will be NULL.
+ *
+ * Upon reaching the last segment of a path, if the reference
+ * is for DELETE, or NOCACHE is set (rewrite), and the
+ * name is located in the cache, it will be dropped.
+ */
+
+/*
+ * Structures associated with name caching.
+ */
+#define NCHHASH(hash) \
+ (&nchashtbl[(hash) & nchash])
+static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
+static TAILQ_HEAD(, namecache) ncneg;	/* Negative entry LRU list */
+static u_long nchash; /* size of hash table */
+SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
+ "Size of namecache hash table");
+static u_long ncnegfactor = 16; /* ratio of negative entries */
+SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
+ "Ratio of negative namecache entries");
+static u_long numneg; /* number of negative entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
+ "Number of negative entries in namecache");
+static u_long numcache; /* number of cache entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
+ "Number of namecache entries");
+static u_long numcachehv; /* number of cache entries with vnodes held */
+SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
+ "Number of namecache entries with vnodes held");
+static u_int ncsizefactor = 2;
+SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
+ "Size factor for namecache");
+
+struct nchstats nchstats; /* cache effectiveness statistics */
+
+static struct rwlock cache_lock;
+RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+
+#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
+#define CACHE_RLOCK() rw_rlock(&cache_lock)
+#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
+#define CACHE_WLOCK() rw_wlock(&cache_lock)
+#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
+
+/*
+ * UMA zones for the VFS cache.
+ *
+ * The small cache is used for entries with short names, which are the
+ * most common. The large cache is used for entries which are too big to
+ * fit in the small cache.
+ */
+static uma_zone_t cache_zone_small;
+static uma_zone_t cache_zone_small_ts;
+static uma_zone_t cache_zone_large;
+static uma_zone_t cache_zone_large_ts;
+
+#define CACHE_PATH_CUTOFF 35
+
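+/*
+ * Allocate a namecache entry from the UMA zone matching the name length and
+ * whether timestamps need to be stored.
+ */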
+static struct namecache *
+cache_alloc(int len, int ts)
+{
+
+ if (len > CACHE_PATH_CUTOFF) {
+ if (ts)
+ return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
+ else
+ return (uma_zalloc(cache_zone_large, M_WAITOK));
+ }
+ if (ts)
+ return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
+ else
+ return (uma_zalloc(cache_zone_small, M_WAITOK));
+}
+
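+/*
+ * Return a namecache entry to the UMA zone it was allocated from.
+ */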
+static void
+cache_free(struct namecache *ncp)
+{
+ int ts;
+
+ if (ncp == NULL)
+ return;
+ ts = ncp->nc_flag & NCF_TS;
+ if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
+ if (ts)
+ uma_zfree(cache_zone_small_ts, ncp);
+ else
+ uma_zfree(cache_zone_small, ncp);
+ } else if (ts)
+ uma_zfree(cache_zone_large_ts, ncp);
+ else
+ uma_zfree(cache_zone_large, ncp);
+}
+
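+/*
+ * Return a pointer to the name stored in the entry, accounting for the
+ * larger struct namecache_ts layout.
+ */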
+static char *
+nc_get_name(struct namecache *ncp)
+{
+ struct namecache_ts *ncp_ts;
+
+ if ((ncp->nc_flag & NCF_TS) == 0)
+ return (ncp->nc_name);
+ ncp_ts = (struct namecache_ts *)ncp;
+ return (ncp_ts->nc_name);
+}
+
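+/*
+ * Copy the stored timestamp and ticks value out of a timestamped namecache
+ * entry.
+ */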
+static void
+cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
+{
+
+ KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
+ (tsp == NULL && ticksp == NULL),
+ ("No NCF_TS"));
+
+ if (tsp != NULL)
+ *tsp = ((struct namecache_ts *)ncp)->nc_time;
+ if (ticksp != NULL)
+ *ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
+}
+
+static int doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
+ "VFS namecache enabled");
+
+/* Export size information to userland */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+ sizeof(struct namecache), "sizeof(struct namecache)");
+
+/*
+ * The new name cache statistics
+ */
+static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
+ "Name cache statistics");
+#define STATNODE(mode, name, var, descr) \
+ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr);
+STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries");
+STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries");
+static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls,
+ "Number of cache lookups");
+static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits,
+ "Number of '.' hits");
+static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits,
+ "Number of '..' hits");
+static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks,
+ "Number of checks in lookup");
+static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss,
+ "Number of cache misses");
+static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
+ "Number of cache misses we do not want to cache");
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
+ "Number of cache hits (positive) we do not want to cache");
+static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
+ "Number of cache hits (positive)");
+static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps,
+ "Number of cache hits (negative) we do not want to cache");
+static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits,
+ "Number of cache hits (negative)");
+static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades,
+ "Number of updates of the cache after lookup (write lock + retry)");
+
+SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &nchstats, sizeof(nchstats), "LU",
+ "VFS cache effectiveness statistics");
+
+
+
+static void cache_zap(struct namecache *ncp);
+static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
+ u_int *buflen);
+static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen);
+
+static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
+
+#ifdef DIAGNOSTIC
+/*
+ * Grab an atomic snapshot of the name cache hash chain lengths
+ */
+static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
+ "hash table stats");
+
+static int
+sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count;
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ CACHE_RLOCK();
+ count = 0;
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ CACHE_RUNLOCK();
+ error = SYSCTL_OUT(req, &count, sizeof(count));
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
+ "nchash chain lengths");
+
+static int
+sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count, maxlength, used, pct;
+
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, 4 * sizeof(int));
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ used = 0;
+ maxlength = 0;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ count = 0;
+ CACHE_RLOCK();
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ CACHE_RUNLOCK();
+ if (count)
+ used++;
+ if (maxlength < count)
+ maxlength = count;
+ }
+ n_nchash = nchash + 1;
+ pct = (used * 100) / (n_nchash / 100);
+ error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &used, sizeof(used));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &pct, sizeof(pct));
+ if (error)
+ return (error);
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
+ "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
+#endif
+
+/*
+ * cache_zap():
+ *
+ * Removes a namecache entry from cache, whether it contains an actual
+ * pointer to a vnode or is just a negative cache entry.
+ */
+static void
+cache_zap(struct namecache *ncp)
+{
+ struct vnode *vp;
+
+ rw_assert(&cache_lock, RA_WLOCKED);
+ CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
+#ifdef KDTRACE_HOOKS
+ if (ncp->nc_vp != NULL) {
+ SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
+ nc_get_name(ncp), ncp->nc_vp, 0, 0);
+ } else {
+ SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+ nc_get_name(ncp), 0, 0, 0);
+ }
+#endif
+ vp = NULL;
+ LIST_REMOVE(ncp, nc_hash);
+ if (ncp->nc_flag & NCF_ISDOTDOT) {
+ if (ncp == ncp->nc_dvp->v_cache_dd)
+ ncp->nc_dvp->v_cache_dd = NULL;
+ } else {
+ LIST_REMOVE(ncp, nc_src);
+ if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
+ vp = ncp->nc_dvp;
+ numcachehv--;
+ }
+ }
+ if (ncp->nc_vp) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+ if (ncp == ncp->nc_vp->v_cache_dd)
+ ncp->nc_vp->v_cache_dd = NULL;
+ } else {
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ numneg--;
+ }
+ numcache--;
+ cache_free(ncp);
+ if (vp)
+ vdrop(vp);
+}
+
+/*
+ * Lookup an entry in the cache
+ *
+ * Lookup is called with dvp pointing to the directory to search,
+ * cnp pointing to the name of the entry being sought. If the lookup
+ * succeeds, the vnode is returned in *vpp, and a status of -1 is
+ * returned. If the lookup determines that the name does not exist
+ * (negative caching), a status of ENOENT is returned. If the lookup
+ * fails, a status of zero is returned. If the directory vnode is
+ * recycled out from under us due to a forced unmount, a status of
+ * ENOENT is returned.
+ *
+ * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
+ * unlocked. If we're looking up . an extra ref is taken, but the lock is
+ * not recursively acquired.
+ */
+
+int
+cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+    struct timespec *tsp, int *ticksp)
+{
+ struct namecache *ncp;
+ uint32_t hash;
+ int error, ltype, wlocked;
+
+ if (!doingcache) {
+ cnp->cn_flags &= ~MAKEENTRY;
+ return (0);
+ }
+retry:
+ CACHE_RLOCK();
+ wlocked = 0;
+ numcalls++;
+ error = 0;
+
+retry_wlocked:
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ *vpp = dvp;
+ CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
+ dvp, cnp->cn_nameptr);
+ dothits++;
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
+ *vpp, 0, 0);
+ if (tsp != NULL)
+ timespecclear(tsp);
+ if (ticksp != NULL)
+ *ticksp = ticks;
+ goto success;
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ dotdothits++;
+ if (dvp->v_cache_dd == NULL) {
+ SDT_PROBE(vfs, namecache, lookup, miss, dvp,
+ "..", NULL, 0, 0);
+ goto unlock;
+ }
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
+ cache_zap(dvp->v_cache_dd);
+ dvp->v_cache_dd = NULL;
+ CACHE_WUNLOCK();
+ return (0);
+ }
+ ncp = dvp->v_cache_dd;
+ if (ncp->nc_flag & NCF_ISDOTDOT)
+ *vpp = ncp->nc_vp;
+ else
+ *vpp = ncp->nc_dvp;
+ /* Return failure if negative entry was found. */
+ if (*vpp == NULL)
+ goto negative_success;
+ CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
+ dvp, cnp->cn_nameptr, *vpp);
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
+ *vpp, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
+ NCF_DTS && tsp != NULL)
+ *tsp = ((struct namecache_ts *)ncp)->
+ nc_dotdottime;
+ goto success;
+ }
+ }
+
+ hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
+ hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
+ LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ numchecks++;
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == NULL) {
+ SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
+ NULL, 0, 0);
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ nummisszap++;
+ } else {
+ nummiss++;
+ }
+ nchstats.ncs_miss++;
+ goto unlock;
+ }
+
+ /* We don't want to have an entry, so dump it */
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ numposzaps++;
+ nchstats.ncs_badhits++;
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ return (0);
+ }
+
+ /* We found a "positive" match, return the vnode */
+ if (ncp->nc_vp) {
+ numposhits++;
+ nchstats.ncs_goodhits++;
+ *vpp = ncp->nc_vp;
+ CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
+ dvp, cnp->cn_nameptr, *vpp, ncp);
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
+ *vpp, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ goto success;
+ }
+
+negative_success:
+ /* We found a negative match, and want to create it, so purge */
+ if (cnp->cn_nameiop == CREATE) {
+ numnegzaps++;
+ nchstats.ncs_badhits++;
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ return (0);
+ }
+
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ numneghits++;
+ /*
+ * We found a "negative" match, so we shift it to the end of
+ * the "negative" cache entries queue to satisfy LRU. Also,
+ * check to see if the entry is a whiteout; indicate this to
+ * the componentname, if so.
+ */
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ nchstats.ncs_neghits++;
+ if (ncp->nc_flag & NCF_WHITE)
+ cnp->cn_flags |= ISWHITEOUT;
+ SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
+ 0, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ CACHE_WUNLOCK();
+ return (ENOENT);
+
+wlock:
+ /*
+ * We need to update the cache after our lookup, so upgrade to
+ * a write lock and retry the operation.
+ */
+ CACHE_RUNLOCK();
+ CACHE_WLOCK();
+ numupgrades++;
+ wlocked = 1;
+ goto retry_wlocked;
+
+success:
+ /*
+ * On success we return a locked and ref'd vnode as per the lookup
+ * protocol.
+ */
+ if (dvp == *vpp) { /* lookup on "." */
+ VREF(*vpp);
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ /*
+ * When we lookup "." we still can be asked to lock it
+ * differently...
+ */
+ ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(*vpp)) {
+ if (ltype == LK_EXCLUSIVE) {
+ vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
+ if ((*vpp)->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ vrele(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ } else
+ vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
+ }
+ return (-1);
+ }
+ ltype = 0; /* silence gcc warning */
+ if (cnp->cn_flags & ISDOTDOT) {
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK(dvp, 0);
+ }
+ VI_LOCK(*vpp);
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+ if (cnp->cn_flags & ISDOTDOT) {
+ vn_lock(dvp, ltype | LK_RETRY);
+ if (dvp->v_iflag & VI_DOOMED) {
+ if (error == 0)
+ vput(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ }
+ if (error) {
+ *vpp = NULL;
+ goto retry;
+ }
+ if ((cnp->cn_flags & ISLASTCN) &&
+ (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+ ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+ }
+ return (-1);
+
+unlock:
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ return (0);
+}
+
+/*
+ * Add an entry to the cache.
+ */
+void
+cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+    struct timespec *tsp, struct timespec *dtsp)
+{
+ struct namecache *ncp, *n2;
+ struct namecache_ts *n3;
+ struct nchashhead *ncpp;
+ uint32_t hash;
+ int flag;
+ int hold;
+ int zap;
+ int len;
+
+ CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
+ VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
+ ("cache_enter: Adding a doomed vnode"));
+ VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
+ ("cache_enter: Doomed vnode used as src"));
+
+ if (!doingcache)
+ return;
+
+ /*
+ * Avoid blowout in namecache entries.
+ */
+ if (numcache >= desiredvnodes * ncsizefactor)
+ return;
+
+ flag = 0;
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1)
+ return;
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ CACHE_WLOCK();
+ /*
+ * If dotdot entry already exists, just retarget it
+ * to new parent vnode, otherwise continue with new
+ * namecache entry allocation.
+ */
+ if ((ncp = dvp->v_cache_dd) != NULL &&
+ ncp->nc_flag & NCF_ISDOTDOT) {
+ KASSERT(ncp->nc_dvp == dvp,
+ ("wrong isdotdot parent"));
+ if (ncp->nc_vp != NULL)
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
+ ncp, nc_dst);
+ else
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ if (vp != NULL)
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst,
+ ncp, nc_dst);
+ else
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ ncp->nc_vp = vp;
+ CACHE_WUNLOCK();
+ return;
+ }
+ dvp->v_cache_dd = NULL;
+ SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
+ 0, 0);
+ CACHE_WUNLOCK();
+ flag = NCF_ISDOTDOT;
+ }
+ }
+
+ hold = 0;
+ zap = 0;
+
+ /*
+ * Calculate the hash key and setup as much of the new
+ * namecache entry as possible before acquiring the lock.
+ */
+ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
+ ncp->nc_vp = vp;
+ ncp->nc_dvp = dvp;
+ ncp->nc_flag = flag;
+ if (tsp != NULL) {
+ n3 = (struct namecache_ts *)ncp;
+ n3->nc_time = *tsp;
+ n3->nc_ticks = ticks;
+ n3->nc_flag |= NCF_TS;
+ if (dtsp != NULL) {
+ n3->nc_dotdottime = *dtsp;
+ n3->nc_flag |= NCF_DTS;
+ }
+ }
+ len = ncp->nc_nlen = cnp->cn_namelen;
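+ /*
+ * The hash key covers both the component name and the parent
+ * vnode pointer, so the same name under different directories
+ * falls into different hash chains.
+ */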
+ hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
+ strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
+ hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
+ CACHE_WLOCK();
+
+ /*
+ * See if this vnode or negative entry is already in the cache
+ * with this name. This can happen with concurrent lookups of
+ * the same path name.
+ */
+ ncpp = NCHHASH(hash);
+ LIST_FOREACH(n2, ncpp, nc_hash) {
+ if (n2->nc_dvp == dvp &&
+ n2->nc_nlen == cnp->cn_namelen &&
+ !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
+ if (tsp != NULL) {
+ KASSERT((n2->nc_flag & NCF_TS) != 0,
+ ("no NCF_TS"));
+ n3 = (struct namecache_ts *)n2;
+ n3->nc_time =
+ ((struct namecache_ts *)ncp)->nc_time;
+ n3->nc_ticks =
+ ((struct namecache_ts *)ncp)->nc_ticks;
+ if (dtsp != NULL) {
+ n3->nc_dotdottime =
+ ((struct namecache_ts *)ncp)->
+ nc_dotdottime;
+ n3->nc_flag |= NCF_DTS;
+ }
+ }
+ CACHE_WUNLOCK();
+ cache_free(ncp);
+ return;
+ }
+ }
+
+ if (flag == NCF_ISDOTDOT) {
+ /*
+ * See if we are trying to add a ".." entry, but some other lookup
+ * has already populated the v_cache_dd pointer.
+ */
+ if (dvp->v_cache_dd != NULL) {
+ CACHE_WUNLOCK();
+ cache_free(ncp);
+ return;
+ }
+ KASSERT(vp == NULL || vp->v_type == VDIR,
+ ("wrong vnode type %p", vp));
+ dvp->v_cache_dd = ncp;
+ }
+
+ numcache++;
+ if (!vp) {
+ numneg++;
+ if (cnp->cn_flags & ISWHITEOUT)
+ ncp->nc_flag |= NCF_WHITE;
+ } else if (vp->v_type == VDIR) {
+ if (flag != NCF_ISDOTDOT) {
+ /*
+ * In this case, the cache entry maps both the name of
+ * the directory and the name ".." for the directory's
+ * parent.
+ */
+ if ((n2 = vp->v_cache_dd) != NULL &&
+ (n2->nc_flag & NCF_ISDOTDOT) != 0)
+ cache_zap(n2);
+ vp->v_cache_dd = ncp;
+ }
+ } else {
+ vp->v_cache_dd = NULL;
+ }
+
+ /*
+ * Insert the new namecache entry into the appropriate chain
+ * within the cache entries table.
+ */
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+ if (flag != NCF_ISDOTDOT) {
+ if (LIST_EMPTY(&dvp->v_cache_src)) {
+ hold = 1;
+ numcachehv++;
+ }
+ LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
+ }
+
+ /*
+ * If the entry is "negative", we place it into the
+ * "negative" cache queue; otherwise, we place it into the
+ * destination vnode's cache entries queue.
+ */
+ if (vp) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+ SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
+ vp, 0, 0);
+ } else {
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
+ nc_get_name(ncp), 0, 0, 0);
+ }
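+ /*
+ * If negative entries have grown past their allowed share of the
+ * cache (one in ncnegfactor entries), reclaim the least recently
+ * used negative entry from the head of the queue.
+ */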
+ if (numneg * ncnegfactor > numcache) {
+ ncp = TAILQ_FIRST(&ncneg);
+ zap = 1;
+ }
+ if (hold)
+ vhold(dvp);
+ if (zap)
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Name cache initialization, from vfs_init() when we are booting
+ */
+static void
+nchinit(void *dummy __unused)
+{
+
+ TAILQ_INIT(&ncneg);
+
+ cache_zone_small = uma_zcreate("S VFS Cache",
+ sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_small_ts = uma_zcreate("STS VFS Cache",
+ sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_large = uma_zcreate("L VFS Cache",
+ sizeof(struct namecache) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
+ sizeof(struct namecache_ts) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
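+ /* Size the hash table in proportion to the maximum number of vnodes. */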
+ nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
+
+
+/*
+ * Invalidate all entries to a particular vnode.
+ */
+void
+cache_purge(vp)
+ struct vnode *vp;
+{
+
+ CTR1(KTR_VFS, "cache_purge(%p)", vp);
+ SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ while (!LIST_EMPTY(&vp->v_cache_src))
+ cache_zap(LIST_FIRST(&vp->v_cache_src));
+ while (!TAILQ_EMPTY(&vp->v_cache_dst))
+ cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+ if (vp->v_cache_dd != NULL) {
+ KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
+ ("lost dotdot link"));
+ cache_zap(vp->v_cache_dd);
+ }
+ KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Invalidate all negative entries for a particular directory vnode.
+ */
+void
+cache_purge_negative(vp)
+ struct vnode *vp;
+{
+ struct namecache *cp, *ncp;
+
+ CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
+ SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
+ if (cp->nc_vp == NULL)
+ cache_zap(cp);
+ }
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Flush all entries referencing a particular filesystem.
+ */
+void
+cache_purgevfs(mp)
+ struct mount *mp;
+{
+ struct nchashhead *ncpp;
+ struct namecache *ncp, *nnp;
+
+ /* Scan hash tables for applicable entries */
+ SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
+ if (ncp->nc_dvp->v_mount == mp)
+ cache_zap(ncp);
+ }
+ }
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Perform canonical checks and cache lookup, and pass on to the
+ * filesystem through vop_cachedlookup only if needed.
+ */
+
+int
+vfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct vnode *dvp;
+ int error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct ucred *cred = cnp->cn_cred;
+ int flags = cnp->cn_flags;
+ struct thread *td = cnp->cn_thread;
+
+ *vpp = NULL;
+ dvp = ap->a_dvp;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ error = VOP_ACCESS(dvp, VEXEC, cred, td);
+ if (error)
+ return (error);
+
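+ /*
+ * cache_lookup() returns -1 for a positive hit, ENOENT for a
+ * negative hit and 0 for a miss; only a miss falls through to
+ * the filesystem's own lookup routine.
+ */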
+ error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
+ if (error == 0)
+ return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getcwd_args {
+ u_char *buf;
+ u_int buflen;
+};
+#endif
+
+/*
+ * XXX All of these sysctls would probably be more productive dead.
+ */
+static int disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
+ "Disable the getcwd syscall");
+
+/* Implementation of the getcwd syscall. */
+int
+sys___getcwd(td, uap)
+ struct thread *td;
+ struct __getcwd_args *uap;
+{
+
+ return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
+}
+
+int
+kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
+{
+ char *bp, *tmpbuf;
+ struct filedesc *fdp;
+ struct vnode *cdir, *rdir;
+ int error;
+
+ if (disablecwd)
+ return (ENODEV);
+ if (buflen < 2)
+ return (EINVAL);
+ if (buflen > MAXPATHLEN)
+ buflen = MAXPATHLEN;
+
+ tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ cdir = fdp->fd_cdir;
+ VREF(cdir);
+ rdir = fdp->fd_rdir;
+ VREF(rdir);
+ FILEDESC_SUNLOCK(fdp);
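+ /* Resolve the path from the current directory up to the process root. */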
+ error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
+ vrele(rdir);
+ vrele(cdir);
+
+ if (!error) {
+ if (bufseg == UIO_SYSSPACE)
+ bcopy(bp, buf, strlen(bp) + 1);
+ else
+ error = copyout(bp, buf, strlen(bp) + 1);
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_NAMEI))
+ ktrnamei(bp);
+#endif
+ }
+ free(tmpbuf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Thus begins the fullpath magic.
+ */
+
+#undef STATNODE
+#define STATNODE(name, descr) \
+ static u_int name; \
+ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr)
+
+static int disablefullpath;
+SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
+ "Disable the vn_fullpath function");
+
+/* These count for kern___getcwd(), too. */
+STATNODE(numfullpathcalls, "Number of fullpath search calls");
+STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
+STATNODE(numfullpathfail2,
+ "Number of fullpath search errors (VOP_VPTOCNP failures)");
+STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
+STATNODE(numfullpathfound, "Number of successful fullpath calls");
+
+/*
+ * Retrieve the full filesystem path that corresponds to a vnode from the name
+ * cache (if available).
+ */
+int
+vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
+{
+ char *buf;
+ struct filedesc *fdp;
+ struct vnode *rdir;
+ int error;
+
+ if (disablefullpath)
+ return (ENODEV);
+ if (vn == NULL)
+ return (EINVAL);
+
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ rdir = fdp->fd_rdir;
+ VREF(rdir);
+ FILEDESC_SUNLOCK(fdp);
+ error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
+ vrele(rdir);
+
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+/*
+ * This function is similar to vn_fullpath, but it attempts to look up the
+ * pathname relative to the global root mount point. This is required for the
+ * auditing sub-system, as audited pathnames must be absolute, relative to the
+ * global root mount point.
+ */
+int
+vn_fullpath_global(struct thread *td, struct vnode *vn,
+ char **retbuf, char **freebuf)
+{
+ char *buf;
+ int error;
+
+ if (disablefullpath)
+ return (ENODEV);
+ if (vn == NULL)
+ return (EINVAL);
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+int
+vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
+{
+ int error;
+
+ CACHE_RLOCK();
+ error = vn_vptocnp_locked(vp, cred, buf, buflen);
+ if (error == 0)
+ CACHE_RUNLOCK();
+ return (error);
+}
+
+static int
+vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
+ u_int *buflen)
+{
+ struct vnode *dvp;
+ struct namecache *ncp;
+ int error;
+
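+ /*
+ * Look for a regular (non-"..") entry naming this vnode; it
+ * provides both the vnode's name and its parent directory.
+ */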
+ TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ }
+ if (ncp != NULL) {
+ if (*buflen < ncp->nc_nlen) {
+ CACHE_RUNLOCK();
+ vrele(*vp);
+ numfullpathfail4++;
+ error = ENOMEM;
+ SDT_PROBE(vfs, namecache, fullpath, return, error,
+ vp, NULL, 0, 0);
+ return (error);
+ }
+ *buflen -= ncp->nc_nlen;
+ memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
+ SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
+ nc_get_name(ncp), vp, 0, 0);
+ dvp = *vp;
+ *vp = ncp->nc_dvp;
+ vref(*vp);
+ CACHE_RUNLOCK();
+ vrele(dvp);
+ CACHE_RLOCK();
+ return (0);
+ }
+ SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
+
+ CACHE_RUNLOCK();
+ vn_lock(*vp, LK_SHARED | LK_RETRY);
+ error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
+ vput(*vp);
+ if (error) {
+ numfullpathfail2++;
+ SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
+ NULL, 0, 0);
+ return (error);
+ }
+
+ *vp = dvp;
+ CACHE_RLOCK();
+ if (dvp->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ CACHE_RUNLOCK();
+ vrele(dvp);
+ error = ENOENT;
+ SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
+ NULL, 0, 0);
+ return (error);
+ }
+ /*
+ * *vp has its use count incremented still.
+ */
+
+ return (0);
+}
+
+/*
+ * The magic behind kern___getcwd() and vn_fullpath().
+ */
+static int
+vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen)
+{
+ int error, slash_prefixed;
+#ifdef KDTRACE_HOOKS
+ struct vnode *startvp = vp;
+#endif
+ struct vnode *vp1;
+
+ buflen--;
+ buf[buflen] = '\0';
+ error = 0;
+ slash_prefixed = 0;
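+ /*
+ * The path is assembled backwards, one component at a time,
+ * from the end of the buffer towards its beginning.
+ */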
+
+ SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
+ numfullpathcalls++;
+ vref(vp);
+ CACHE_RLOCK();
+ if (vp->v_type != VDIR) {
+ error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ return (error);
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ while (vp != rdir && vp != rootvnode) {
+ if (vp->v_vflag & VV_ROOT) {
+ if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
+ CACHE_RUNLOCK();
+ vrele(vp);
+ error = ENOENT;
+ SDT_PROBE(vfs, namecache, fullpath, return,
+ error, vp, NULL, 0, 0);
+ break;
+ }
+ vp1 = vp->v_mount->mnt_vnodecovered;
+ vref(vp1);
+ CACHE_RUNLOCK();
+ vrele(vp);
+ vp = vp1;
+ CACHE_RLOCK();
+ continue;
+ }
+ if (vp->v_type != VDIR) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ numfullpathfail1++;
+ error = ENOTDIR;
+ SDT_PROBE(vfs, namecache, fullpath, return,
+ error, vp, NULL, 0, 0);
+ break;
+ }
+ error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ break;
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ error = ENOMEM;
+ SDT_PROBE(vfs, namecache, fullpath, return, error,
+ startvp, NULL, 0, 0);
+ break;
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ if (error)
+ return (error);
+ if (!slash_prefixed) {
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ numfullpathfail4++;
+ SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
+ startvp, NULL, 0, 0);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ }
+ numfullpathfound++;
+ CACHE_RUNLOCK();
+ vrele(vp);
+
+ SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
+ 0, 0);
+ *retbuf = buf + buflen;
+ return (0);
+}
+
+struct vnode *
+vn_dir_dd_ino(struct vnode *vp)
+{
+ struct namecache *ncp;
+ struct vnode *ddvp;
+
+ ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
+ CACHE_RLOCK();
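+ /*
+ * Any regular (non-"..") entry naming this directory identifies
+ * its parent via nc_dvp.
+ */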
+ TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+ continue;
+ ddvp = ncp->nc_dvp;
+ VI_LOCK(ddvp);
+ CACHE_RUNLOCK();
+ if (vget(ddvp, LK_INTERLOCK | LK_SHARED | LK_NOWAIT, curthread))
+ return (NULL);
+ return (ddvp);
+ }
+ CACHE_RUNLOCK();
+ return (NULL);
+}
+
+int
+vn_commname(struct vnode *vp, char *buf, u_int buflen)
+{
+ struct namecache *ncp;
+ int l;
+
+ CACHE_RLOCK();
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ if (ncp == NULL) {
+ CACHE_RUNLOCK();
+ return (ENOENT);
+ }
+ l = min(ncp->nc_nlen, buflen - 1);
+ memcpy(buf, nc_get_name(ncp), l);
+ CACHE_RUNLOCK();
+ buf[l] = '\0';
+ return (0);
+}
+
+/* ABI compat shims for old kernel modules. */
+#undef cache_enter
+
+void cache_enter(struct vnode *dvp, struct vnode *vp,
+ struct componentname *cnp);
+
+void
+cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
+{
+
+ cache_enter_time(dvp, vp, cnp, NULL, NULL);
+}
+
+/*
+ * This function updates the path string to the vnode's full global path
+ * and checks the size of the new path string against the pathlen argument.
+ *
+ * Requires a locked, referenced vnode and GIANT lock held.
+ * The vnode is re-locked on success or ENODEV; otherwise it is left unlocked.
+ *
+ * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
+ * vnode is left locked and the path remains untouched.
+ *
+ * If vp is a directory, the call to vn_fullpath_global() always succeeds
+ * because it falls back to the ".." lookup if the namecache lookup fails.
+ */
+int
+vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
+ u_int pathlen)
+{
+ struct nameidata nd;
+ struct vnode *vp1;
+ char *rpath, *fbuf;
+ int error;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* Return ENODEV if sysctl debug.disablefullpath==1 */
+ if (disablefullpath)
+ return (ENODEV);
+
+ /* Construct global filesystem path from vp. */
+ VOP_UNLOCK(vp, 0);
+ error = vn_fullpath_global(td, vp, &rpath, &fbuf);
+
+ if (error != 0) {
+ vrele(vp);
+ return (error);
+ }
+
+ if (strlen(rpath) >= pathlen) {
+ vrele(vp);
+ error = ENAMETOOLONG;
+ goto out;
+ }
+
+ /*
+ * Re-lookup the vnode by path to detect a possible rename.
+ * As a side effect, the vnode is relocked.
+ * If vnode was renamed, return ENOENT.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, path, td);
+ error = namei(&nd);
+ if (error != 0) {
+ vrele(vp);
+ goto out;
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp1 = nd.ni_vp;
+ vrele(vp);
+ if (vp1 == vp)
+ strcpy(path, rpath);
+ else {
+ vput(vp1);
+ error = ENOENT;
+ }
+
+out:
+ free(fbuf, M_TEMP);
+ return (error);
+}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
new file mode 100644
index 0000000..9601082
--- /dev/null
+++ b/sys/kern/vfs_cluster.c
@@ -0,0 +1,1058 @@
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_debug_cluster.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <sys/sysctl.h>
+
+#if defined(CLUSTERDEBUG)
+static int rcluster = 0;
+SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
+ "Debug VFS clustering code");
+#endif
+
+static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
+
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
+static void cluster_callback(struct buf *);
+
+static int write_behind = 1;
+SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
+ "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
+
+static int read_max = 64;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+ "Cluster read-ahead max block count");
+
+static int read_min = 1;
+SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
+ "Cluster read min block count");
+
+/* Page expended to mark partially backed buffers */
+extern vm_page_t bogus_page;
+
+/*
+ * Read data to a buf, including read-ahead if we find this to be beneficial.
+ * cluster_read replaces bread.
+ */
+int
+cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
+ struct ucred *cred, long totread, int seqcount, int gbflags,
+ struct buf **bpp)
+{
+ struct buf *bp, *rbp, *reqbp;
+ struct bufobj *bo;
+ daddr_t blkno, origblkno;
+ int maxra, racluster;
+ int error, ncontig;
+ int i;
+
+ error = 0;
+ bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+ racluster = vp->v_mount->mnt_iosize_max / size;
+ maxra = seqcount;
+ maxra = min(read_max, maxra);
+ maxra = min(nbuf/8, maxra);
+ if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
+ maxra = (filesize / size) - lblkno;
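+ /*
+ * maxra is now bounded by the sequential heuristic, the
+ * vfs.read_max sysctl, a fraction of the buffer cache and the
+ * end of the file.
+ */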
+
+ /*
+ * get the requested block
+ */
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
+ origblkno = lblkno;
+
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back-off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return (0);
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return (0);
+ } else {
+ bp->b_flags &= ~B_RAM;
+ BO_RLOCK(bo);
+ for (i = 1; i < maxra; i++) {
+ /*
+ * Stop if the buffer does not exist or it
+ * is invalid (about to go away?)
+ */
+ rbp = gbincore(&vp->v_bufobj, lblkno+i);
+ if (rbp == NULL || (rbp->b_flags & B_INVAL))
+ break;
+
+ /*
+ * Set another read-ahead mark so we know
+ * to check again (if we can lock the
+ * buffer without waiting).
+ */
+ if ((((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ && (0 == BUF_LOCK(rbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
+ rbp->b_flags |= B_RAM;
+ BUF_UNLOCK(rbp);
+ }
+ }
+ BO_RUNLOCK(bo);
+ if (i >= maxra) {
+ return (0);
+ }
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ /*
+ * If it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ } else {
+ off_t firstread = bp->b_offset;
+ int nblks;
+ long minread;
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("cluster_read: no buffer offset"));
+
+ ncontig = 0;
+
+ /*
+ * Adjust totread if needed
+ */
+ minread = read_min * size;
+ if (minread > totread)
+ totread = minread;
+
+ /*
+ * Compute the total number of blocks that we should read
+ * synchronously.
+ */
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ nblks = howmany(totread, size);
+ if (nblks > racluster)
+ nblks = racluster;
+
+ /*
+ * Now compute the number of contiguous blocks.
+ */
+ if (nblks > 1) {
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontig, NULL);
+ /*
+ * If this failed to map just do the original block.
+ */
+ if (error || blkno == -1)
+ ncontig = 0;
+ }
+
+ /*
+ * If we have contiguous data available do a cluster
+ * otherwise just read the requested block.
+ */
+ if (ncontig) {
+ /* Account for our first block. */
+ ncontig = min(ncontig + 1, nblks);
+ if (ncontig < nblks)
+ nblks = ncontig;
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, gbflags, bp);
+ lblkno += (bp->b_bufsize / size);
+ } else {
+ bp->b_flags |= B_RAM;
+ bp->b_iocmd = BIO_READ;
+ lblkno += 1;
+ }
+ }
+
+ /*
+ * handle the synchronous read so that it is available ASAP.
+ */
+ if (bp) {
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(bp, 0);
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+ curthread->td_ru.ru_inblock++;
+ }
+
+ /*
+ * If we have been doing sequential I/O, then do some read-ahead.
+ */
+ while (lblkno < (origblkno + maxra)) {
+ error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
+ if (error)
+ break;
+
+ if (blkno == -1)
+ break;
+
+ /*
+ * We could throttle ncontig here by maxra but we might as
+ * well read the data if it is contiguous. We're throttled
+ * by racluster anyway.
+ */
+ if (ncontig) {
+ ncontig = min(ncontig + 1, racluster);
+ rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
+ size, ncontig, gbflags, NULL);
+ lblkno += (rbp->b_bufsize / size);
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
+ lblkno += 1;
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ rbp->b_flags |= B_ASYNC | B_RAM;
+ rbp->b_iocmd = BIO_READ;
+ rbp->b_blkno = blkno;
+ }
+ if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~B_ASYNC;
+ bqrelse(rbp);
+ continue;
+ }
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(rbp, 0);
+ }
+ rbp->b_flags &= ~B_INVAL;
+ rbp->b_ioflags &= ~BIO_ERROR;
+ if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
+ BUF_KERNPROC(rbp);
+ rbp->b_iooffset = dbtob(rbp->b_blkno);
+ bstrategy(rbp);
+ curthread->td_ru.ru_inblock++;
+ }
+
+ if (reqbp)
+ return (bufwait(reqbp));
+ else
+ return (error);
+}
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
+static struct buf *
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
+{
+ struct bufobj *bo;
+ struct buf *bp, *tbp;
+ daddr_t bn;
+ off_t off;
+ long tinc, tsize;
+ int i, inc, j, k, toff;
+
+ KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
+ ("cluster_rbuild: size %ld != f_iosize %jd\n",
+ size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
+
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
+ --run;
+ }
+
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_iocmd = BIO_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
+ if (tbp->b_flags & B_CACHE)
+ return (tbp);
+ tbp->b_flags |= B_ASYNC | B_RAM;
+ tbp->b_iocmd = BIO_READ;
+ }
+ tbp->b_blkno = blkno;
+ if ((tbp->b_flags & B_MALLOC) ||
+ ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
+ return (tbp);
+
+ bp = trypbuf(&cluster_pbuf_freecnt);
+ if (bp == NULL)
+ return (tbp);
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
+ bp->b_iocmd = BIO_READ;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ bp->b_offset = tbp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+
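+ /* Disk address increment, in DEV_BSIZE units, per logical block. */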
+ inc = btodb(size);
+ bo = &vp->v_bufobj;
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i == 0) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(tbp);
+ vm_object_pip_add(tbp->b_bufobj->bo_object,
+ tbp->b_npages);
+ for (k = 0; k < tbp->b_npages; k++)
+ vm_page_sbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ } else {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > vp->v_mount->mnt_iosize_max) {
+ break;
+ }
+
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
+
+ /* Don't wait around for locked bufs. */
+ if (tbp == NULL)
+ break;
+
+ /*
+ * Stop scanning if the buffer is fully valid
+ * (marked B_CACHE), or locked (may be doing a
+ * background write), or if the buffer is not
+ * VMIO backed. The clustering code can only deal
+ * with VMIO-backed buffers. The bo lock is not
+ * required for the BKGRDINPROG check since it
+ * can not be set without the buf lock.
+ */
+ if ((tbp->b_vflags & BV_BKGRDINPROG) ||
+ (tbp->b_flags & B_CACHE) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+ /*
+ * The buffer must be completely invalid in order to
+ * take part in the cluster. If it is partially valid
+ * then we stop.
+ */
+ off = tbp->b_offset;
+ tsize = size;
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; tsize > 0; j++) {
+ toff = off & PAGE_MASK;
+ tinc = tsize;
+ if (toff + tinc > PAGE_SIZE)
+ tinc = PAGE_SIZE - toff;
+ VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
+ if ((tbp->b_pages[j]->valid &
+ vm_page_bits(toff, tinc)) != 0)
+ break;
+ if (vm_page_xbusied(tbp->b_pages[j]))
+ break;
+ vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
+ vm_page_sbusy(tbp->b_pages[j]);
+ off += tinc;
+ tsize -= tinc;
+ }
+ if (tsize > 0) {
+clean_sbusy:
+ vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
+ for (k = 0; k < j; k++)
+ vm_page_sunbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ break;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+
+ /*
+ * Set a read-ahead mark as appropriate
+ */
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+
+ /*
+ * Set the buffer up for an async read (XXX should
+ * we do this only if we do not wind up brelse()ing?).
+ * Set the block number if it isn't set, otherwise
+ * if it is make sure it matches the block number we
+ * expect.
+ */
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_iocmd = BIO_READ;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ goto clean_sbusy;
+ }
+ }
+ /*
+ * XXX fbp from caller may not be B_ASYNC, but we are going
+ * to biodone() it in cluster_callback() anyway
+ */
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if (m->valid == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ /*
+ * Don't inherit tbp->b_bufsize as it may be larger due to
+ * a non-page-aligned size. Instead just aggregate using
+ * 'size'.
+ */
+ if (tbp->b_bcount != size)
+ printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
+ if (tbp->b_bufsize != size)
+ printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ }
+
+ /*
+ * Fully valid pages in the cluster are already good and do not need
+ * to be re-read from disk. Replace the page with bogus_page
+ */
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (j = 0; j < bp->b_npages; j++) {
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
+ if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
+ bp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ return (bp);
+}
+
+/*
+ * Cleanup after a clustered read or write.
+ * This is complicated by the fact that any of the buffers might have
+ * extra memory (if there were no empty buffer headers at allocbuf time)
+ * that we will need to shift around.
+ */
+static void
+cluster_callback(bp)
+ struct buf *bp;
+{
+ struct buf *nbp, *tbp;
+ int error = 0;
+
+ /*
+ * Must propagate errors to all the components.
+ */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
+ /*
+ * Move memory from the large cluster buffer into the component
+ * buffers and mark IO as done on these.
+ */
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
+ if (error) {
+ tbp->b_ioflags |= BIO_ERROR;
+ tbp->b_error = error;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~B_INVAL;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ /*
+ * XXX the bdwrite()/bqrelse() issued during
+ * cluster building clears B_RELBUF (see bqrelse()
+ * comment). If direct I/O was specified, we have
+ * to restore it here to allow the buffer and VM
+ * to be freed.
+ */
+ if (tbp->b_flags & B_DIRECT)
+ tbp->b_flags |= B_RELBUF;
+ }
+ bufdone(tbp);
+ }
+ pbrelvp(bp);
+ relpbuf(bp, &cluster_pbuf_freecnt);
+}
+
+/*
+ * cluster_wbuild_wb:
+ *
+ * Implement modified write build for cluster.
+ *
+ * write_behind = 0 write behind disabled
+ * write_behind = 1 write behind normal (default)
+ * write_behind = 2 write behind backed-off
+ */
+
+static __inline int
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ int r = 0;
+
+ switch (write_behind) {
+ case 2:
+ if (start_lbn < len)
+ break;
+ start_lbn -= len;
+ /* FALLTHROUGH */
+ case 1:
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
+ /* FALLTHROUGH */
+ default:
+ /* FALLTHROUGH */
+ break;
+ }
+ return (r);
+}
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Four cases:
+ * 1. Write is not sequential (write asynchronously)
+ * Write is sequential:
+ * 2. beginning of cluster - begin cluster
+ * 3. middle of a cluster - add to cluster
+ * 4. end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+ int gbflags)
+{
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ if (vp->v_type == VREG) {
+ async = DOINGASYNC(vp);
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ } else {
+ async = 0;
+ lblocksize = bp->b_bufsize;
+ }
+ lbn = bp->b_lblkno;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
+
+ /* Initialize vnode to beginning of file. */
+ if (lbn == 0)
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
+ if (vp->v_clen != 0) {
+ /*
+ * Next block is not sequential.
+ *
+ * If we are not writing at end of file, the process
+ * seeked to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
+ *
+ * Change to algorithm: only push previous cluster if
+ * it was sequential from the point of view of the
+ * seqcount heuristic, otherwise leave the buffer
+ * intact so we can potentially optimize the I/O
+ * later on in the buf_daemon or update daemon
+ * flush.
+ */
+ cursize = vp->v_lastw - vp->v_cstart + 1;
+ if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+ if (!async && seqcount > 0) {
+ cluster_wbuild_wb(vp, lblocksize,
+ vp->v_cstart, cursize, gbflags);
+ }
+ } else {
+ struct buf **bpp, **endbp;
+ struct cluster_save *buflist;
+
+ buflist = cluster_collectbufs(vp, bp, gbflags);
+ endbp = &buflist->bs_children
+ [buflist->bs_nchildren - 1];
+ if (VOP_REALLOCBLKS(vp, buflist)) {
+ /*
+ * Failed, push the previous cluster
+ * if *really* writing sequentially
+ * in the logical file (seqcount > 1),
+ * otherwise delay it in the hopes that
+ * the low level disk driver can
+ * optimize the write ordering.
+ */
+ for (bpp = buflist->bs_children;
+ bpp < endbp; bpp++)
+ brelse(*bpp);
+ free(buflist, M_SEGMENT);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp,
+ lblocksize, vp->v_cstart,
+ cursize, gbflags);
+ }
+ } else {
+ /*
+ * Succeeded, keep building cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp <= endbp; bpp++)
+ bdwrite(*bpp);
+ free(buflist, M_SEGMENT);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+ return;
+ }
+ }
+ }
+ /*
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
+ */
+ if ((vp->v_type == VREG) &&
+ ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+ bp->b_blkno == -1)) {
+ bawrite(bp);
+ vp->v_clen = 0;
+ vp->v_lasta = bp->b_blkno;
+ vp->v_cstart = lbn + 1;
+ vp->v_lastw = lbn;
+ return;
+ }
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
+ vp->v_cstart = lbn + 1;
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
+ vp->v_cstart = lbn;
+ bdwrite(bp);
+ }
+ } else if (lbn == vp->v_cstart + vp->v_clen) {
+ /*
+ * At end of cluster, write it out if seqcount tells us we
+ * are operating sequentially, otherwise let the buf or
+ * update daemon handle it.
+ */
+ bdwrite(bp);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
+ vp->v_clen = 0;
+ vp->v_cstart = lbn + 1;
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are low on memory, get it going NOW
+ */
+ bawrite(bp);
+ } else {
+ /*
+ * In the middle of a cluster, so just delay the I/O for now.
+ */
+ bdwrite(bp);
+ }
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+}
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * The last lbn argument is the current block on which I/O is being
+ * performed. Check to see that it doesn't fall in the middle of
+ * the current block (if last_bp == NULL).
+ */
+int
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ struct buf *bp, *tbp;
+ struct bufobj *bo;
+ int i, j;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ bo = &vp->v_bufobj;
+ while (len > 0) {
+ /*
+ * If the buffer is not delayed-write (i.e. dirty), or it
+ * is delayed-write but either locked or inval, it cannot
+ * partake in the clustered write.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+
+ /*
+ * Extra memory in the buffer, punt on this buffer.
+ * XXX we could handle this in most cases, but we would
+ * have to push the extra memory down to after our max
+ * possible cluster size and then potentially pull it back
+ * up if the cluster was terminated prematurely--too much
+ * hassle.
+ */
+ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
+ (B_CLUSTEROK | B_VMIO)) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ (len == 1) ||
+ ((bp = (vp->v_vflag & VV_MD) != 0 ?
+ trypbuf(&cluster_pbuf_freecnt) :
+ getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+
+ /*
+ * We got a pbuf to make the cluster in,
+ * so initialise it.
+ */
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED)
+ bp->b_wcred = crhold(tbp->b_wcred);
+
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ bp->b_offset = tbp->b_offset;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+ /*
+ * From this location in the file, scan forward to see
+ * if there are buffers with adjacent data that need to
+ * be written as well.
+ */
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) { /* If not the first buffer */
+ /*
+ * If the adjacent data is not even in core it
+ * can't need to be written.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(bo, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ break;
+ }
+
+ /*
+ * If it IS in core, but has different
+ * characteristics, or is locked (which
+ * means it could be undergoing a background
+ * I/O or be in a weird state), then don't
+ * cluster with it.
+ */
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
+ BO_LOCKPTR(bo)))
+ break;
+
+ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
+ B_INVAL | B_DELWRI | B_NEEDCOMMIT))
+ != (B_DELWRI | B_CLUSTEROK |
+ (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
+ tbp->b_wcred != bp->b_wcred) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Check that the combined cluster
+ * would make sense with regard to pages
+ * and would not be too large
+ */
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + (dbsize * i)) !=
+ tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) >
+ (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Do not pull in pinned buffers.
+ */
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Ok, it's passed all the tests,
+ * so remove it from the free list
+ * and mark it busy. We will use it.
+ */
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+ } /* end of code for non-first buffers only */
+ /*
+ * If the IO is via the VM then we do some
+ * special VM hackery (yuck). Since the buffer's
+ * block size may not be page-aligned it is possible
+ * for a page to be shared between two buffers. We
+ * have to get rid of the duplication when building
+ * the cluster.
+ */
+ if (tbp->b_flags & B_VMIO) {
+ vm_page_t m;
+
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ if (i == 0) {
+ vfs_drain_busy_pages(tbp);
+ } else { /* if not first buffer */
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ if (vm_page_xbusied(m)) {
+ VM_OBJECT_WUNLOCK(
+ tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ goto finishcluster;
+ }
+ }
+ }
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ vm_page_sbusy(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ /*
+ * If any of the clustered buffers have their
+ * B_BARRIER flag set, transfer that request to
+ * the cluster.
+ */
+ bp->b_flags |= (tbp->b_flags & B_BARRIER);
+ tbp->b_flags &= ~(B_DONE | B_BARRIER);
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ tbp->b_iocmd = BIO_WRITE;
+ bundirty(tbp);
+ reassignbuf(tbp); /* put on clean list */
+ bufobj_wref(tbp->b_bufobj);
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ finishcluster:
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic(
+ "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
+ }
+ return (totalwritten);
+}
+
+/*
+ * Collect together all the buffers in a cluster,
+ * plus one additional buffer.
+ */
+static struct cluster_save *
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
+{
+ struct cluster_save *buflist;
+ struct buf *bp;
+ daddr_t lbn;
+ int i, len;
+
+ len = vp->v_lastw - vp->v_cstart + 1;
+ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+ M_SEGMENT, M_WAITOK);
+ buflist->bs_nchildren = 0;
+ buflist->bs_children = (struct buf **) (buflist + 1);
+ for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
+ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
+ buflist->bs_children[i] = bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ }
+ buflist->bs_children[i] = bp = last_bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ buflist->bs_nchildren = i + 1;
+ return (buflist);
+}
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
new file mode 100644
index 0000000..2c01117
--- /dev/null
+++ b/sys/kern/vfs_default.c
@@ -0,0 +1,1269 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/rwlock.h>
+#include <sys/fcntl.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/poll.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+
+static int vop_nolookup(struct vop_lookup_args *);
+static int vop_norename(struct vop_rename_args *);
+static int vop_nostrategy(struct vop_strategy_args *);
+static int get_next_dirent(struct vnode *vp, struct dirent **dpp,
+ char *dirbuf, int dirbuflen, off_t *off,
+ char **cpos, int *len, int *eofflag,
+ struct thread *td);
+static int dirent_exists(struct vnode *vp, const char *dirname,
+ struct thread *td);
+
+#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
+
+static int vop_stdis_text(struct vop_is_text_args *ap);
+static int vop_stdset_text(struct vop_set_text_args *ap);
+static int vop_stdunset_text(struct vop_unset_text_args *ap);
+static int vop_stdget_writecount(struct vop_get_writecount_args *ap);
+static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+
+/*
+ * This vnode table stores what we want to do if the filesystem doesn't
+ * implement a particular VOP.
+ *
+ * If there is no specific entry here, we will return EOPNOTSUPP.
+ *
+ * Note that every filesystem has to implement either vop_access
+ * or vop_accessx; failing to do so will result in an immediate crash
+ * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
+ * which calls vop_stdaccess() etc.
+ */
+
+struct vop_vector default_vnodeops = {
+ .vop_default = NULL,
+ .vop_bypass = VOP_EOPNOTSUPP,
+
+ .vop_access = vop_stdaccess,
+ .vop_accessx = vop_stdaccessx,
+ .vop_advise = vop_stdadvise,
+ .vop_advlock = vop_stdadvlock,
+ .vop_advlockasync = vop_stdadvlockasync,
+ .vop_advlockpurge = vop_stdadvlockpurge,
+ .vop_allocate = vop_stdallocate,
+ .vop_bmap = vop_stdbmap,
+ .vop_close = VOP_NULL,
+ .vop_fsync = VOP_NULL,
+ .vop_getpages = vop_stdgetpages,
+ .vop_getwritemount = vop_stdgetwritemount,
+ .vop_inactive = VOP_NULL,
+ .vop_ioctl = VOP_ENOTTY,
+ .vop_kqfilter = vop_stdkqfilter,
+ .vop_islocked = vop_stdislocked,
+ .vop_lock1 = vop_stdlock,
+ .vop_lookup = vop_nolookup,
+ .vop_open = VOP_NULL,
+ .vop_pathconf = VOP_EINVAL,
+ .vop_poll = vop_nopoll,
+ .vop_putpages = vop_stdputpages,
+ .vop_readlink = VOP_EINVAL,
+ .vop_rename = vop_norename,
+ .vop_revoke = VOP_PANIC,
+ .vop_strategy = vop_nostrategy,
+ .vop_unlock = vop_stdunlock,
+ .vop_vptocnp = vop_stdvptocnp,
+ .vop_vptofh = vop_stdvptofh,
+ .vop_unp_bind = vop_stdunp_bind,
+ .vop_unp_connect = vop_stdunp_connect,
+ .vop_unp_detach = vop_stdunp_detach,
+ .vop_is_text = vop_stdis_text,
+ .vop_set_text = vop_stdset_text,
+ .vop_unset_text = vop_stdunset_text,
+ .vop_get_writecount = vop_stdget_writecount,
+ .vop_add_writecount = vop_stdadd_writecount,
+};
+
+/*
+ * Series of placeholder functions for various error returns for
+ * VOPs.
+ */
+
+int
+vop_eopnotsupp(struct vop_generic_args *ap)
+{
+ /*
+ printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
+ */
+
+ return (EOPNOTSUPP);
+}
+
+int
+vop_ebadf(struct vop_generic_args *ap)
+{
+
+ return (EBADF);
+}
+
+int
+vop_enotty(struct vop_generic_args *ap)
+{
+
+ return (ENOTTY);
+}
+
+int
+vop_einval(struct vop_generic_args *ap)
+{
+
+ return (EINVAL);
+}
+
+int
+vop_enoent(struct vop_generic_args *ap)
+{
+
+ return (ENOENT);
+}
+
+int
+vop_null(struct vop_generic_args *ap)
+{
+
+ return (0);
+}
+
+/*
+ * Helper function to panic on some bad VOPs in some filesystems.
+ */
+int
+vop_panic(struct vop_generic_args *ap)
+{
+
+ panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
+}
+
+/*
+ * vop_std<something> and vop_no<something> are default functions for use by
+ * filesystems that need the "default reasonable" implementation for a
+ * particular operation.
+ *
+ * The documentation for the operations they implement exists (if it exists)
+ * in the VOP_<SOMETHING>(9) manpage (all uppercase).
+ */
+
+/*
+ * Default vop for filesystems that do not support name lookup
+ */
+static int
+vop_nolookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ *ap->a_vpp = NULL;
+ return (ENOTDIR);
+}
+
+/*
+ * vop_norename:
+ *
+ * Handle unlock and reference counting for arguments of vop_rename
+ * for filesystems that do not implement rename operation.
+ */
+static int
+vop_norename(struct vop_rename_args *ap)
+{
+
+ vop_rename_fail(ap);
+ return (EOPNOTSUPP);
+}
+
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a BIO_READ strategy call.
+ * Typically B_INVAL is assumed to already be clear prior to a write
+ * and should not be cleared manually unless you just made the buffer
+ * invalid. BIO_ERROR should be cleared either way.
+ */
+
+static int
+vop_nostrategy (struct vop_strategy_args *ap)
+{
+ printf("No strategy for buffer at %p\n", ap->a_bp);
+ vprint("vnode", ap->a_vp);
+ ap->a_bp->b_ioflags |= BIO_ERROR;
+ ap->a_bp->b_error = EOPNOTSUPP;
+ bufdone(ap->a_bp);
+ return (EOPNOTSUPP);
+}
+
+static int
+get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
+ int dirbuflen, off_t *off, char **cpos, int *len,
+ int *eofflag, struct thread *td)
+{
+ int error, reclen;
+ struct uio uio;
+ struct iovec iov;
+ struct dirent *dp;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ if (*len == 0) {
+ iov.iov_base = dirbuf;
+ iov.iov_len = dirbuflen;
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = *off;
+ uio.uio_resid = dirbuflen;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = td;
+
+ *eofflag = 0;
+
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
+ NULL, NULL);
+ if (error)
+ return (error);
+
+ *off = uio.uio_offset;
+
+ *cpos = dirbuf;
+ *len = (dirbuflen - uio.uio_resid);
+
+ if (*len == 0)
+ return (ENOENT);
+ }
+
+ dp = (struct dirent *)(*cpos);
+ reclen = dp->d_reclen;
+ *dpp = dp;
+
+ /* Check for a malformed directory entry. */
+ if (reclen < DIRENT_MINSIZE)
+ return (EINVAL);
+
+ *cpos += reclen;
+ *len -= reclen;
+
+ return (0);
+}
+
+/*
+ * Check if a named file exists in a given directory vnode.
+ */
+static int
+dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
+{
+ char *dirbuf, *cpos;
+ int error, eofflag, dirbuflen, len, found;
+ off_t off;
+ struct dirent *dp;
+ struct vattr va;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ found = 0;
+
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error)
+ return (found);
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ off = 0;
+ len = 0;
+ do {
+ error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+ strcmp(dp->d_name, dirname) == 0) {
+ found = 1;
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+
+out:
+ free(dirbuf, M_TEMP);
+ return (found);
+}
+
+int
+vop_stdaccess(struct vop_access_args *ap)
+{
+
+ KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
+ VAPPEND)) == 0, ("invalid bit in accmode"));
+
+ return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
+}
+
+int
+vop_stdaccessx(struct vop_accessx_args *ap)
+{
+ int error;
+ accmode_t accmode = ap->a_accmode;
+
+ error = vfs_unixify_accmode(&accmode);
+ if (error != 0)
+ return (error);
+
+ if (accmode == 0)
+ return (0);
+
+ return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+vop_stdadvlock(struct vop_advlock_args *ap)
+{
+ struct vnode *vp;
+ struct ucred *cred;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+
+ return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockasync(struct vop_advlockasync_args *ap)
+{
+ struct vnode *vp;
+ struct ucred *cred;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+
+ return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
+{
+ struct vnode *vp;
+
+ vp = ap->a_vp;
+ lf_purgelocks(vp, &vp->v_lockf);
+ return (0);
+}
+
+/*
+ * vop_stdpathconf:
+ *
+ * Standard implementation of POSIX pathconf, to get information about limits
+ * for a filesystem.
+ * Override per filesystem for the case where the filesystem has smaller
+ * limits.
+ */
+int
+vop_stdpathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PATH_MAX:
+ *ap->a_retval = PATH_MAX;
+ return (0);
+ case _PC_LINK_MAX:
+ *ap->a_retval = LINK_MAX;
+ return (0);
+ case _PC_MAX_CANON:
+ *ap->a_retval = MAX_CANON;
+ return (0);
+ case _PC_MAX_INPUT:
+ *ap->a_retval = MAX_INPUT;
+ return (0);
+ case _PC_PIPE_BUF:
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ case _PC_CHOWN_RESTRICTED:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_VDISABLE:
+ *ap->a_retval = _POSIX_VDISABLE;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Standard lock, unlock and islocked functions.
+ */
+int
+vop_stdlock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ return (_lockmgr_args(vp->v_vnlock, ap->a_flags, VI_MTX(vp),
+ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file,
+ ap->a_line));
+}
+
+/* See above. */
+int
+vop_stdunlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp)));
+}
+
+/* See above. */
+int
+vop_stdislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+
+ return (lockstatus(ap->a_vp->v_vnlock));
+}
+
+/*
+ * Return true for select/poll.
+ */
+int
+vop_nopoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (poll_no_poll(ap->a_events));
+}
+
+/*
+ * Implement poll for local filesystems that support it.
+ */
+int
+vop_stdpoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ if (ap->a_events & ~POLLSTANDARD)
+ return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Return our mount point, as we will take charge of the writes.
+ */
+int
+vop_stdgetwritemount(ap)
+ struct vop_getwritemount_args /* {
+ struct vnode *a_vp;
+ struct mount **a_mpp;
+ } */ *ap;
+{
+ struct mount *mp;
+
+ /*
+ * XXX Since this is called unlocked we may be recycled while
+	 * attempting to ref the mount.  If this is the case our mountpoint
+ * will be set to NULL. We only have to prevent this call from
+ * returning with a ref to an incorrect mountpoint. It is not
+ * harmful to return with a ref to our previous mountpoint.
+ */
+ mp = ap->a_vp->v_mount;
+ if (mp != NULL) {
+ vfs_ref(mp);
+ if (mp != ap->a_vp->v_mount) {
+ vfs_rel(mp);
+ mp = NULL;
+ }
+ }
+ *(ap->a_mpp) = mp;
+ return (0);
+}
+
+/*
+ * Default bmap: the block lives in the vnode's own buffer object, the
+ * logical block number is converted to DEV_BSIZE units using the
+ * filesystem's f_iosize, and no clustering before or after the block is
+ * reported.  XXX The VOP_BMAP(9) manpage still needs more detail.
+ */
+int
+vop_stdbmap(ap)
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap;
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+ return (0);
+}
+
+int
+vop_stdfsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ struct ucred *a_cred;
+ int a_waitfor;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct buf *bp;
+ struct bufobj *bo;
+ struct buf *nbp;
+ int error = 0;
+ int maxretry = 1000; /* large, arbitrarily chosen */
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+loop1:
+ /*
+ * MARK/SCAN initialization to avoid infinite loops.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ bp->b_vflags &= ~BV_SCANNED;
+ bp->b_error = 0;
+ }
+
+ /*
+ * Flush all dirty buffers associated with a vnode.
+ */
+loop2:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if ((bp->b_vflags & BV_SCANNED) != 0)
+ continue;
+ bp->b_vflags |= BV_SCANNED;
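+		/*
+		 * Try a non-blocking lock first; if the buffer is busy and
+		 * the caller asked for a synchronous fsync, sleep for it
+		 * (LK_SLEEPFAIL drops the bufobj lock and fails the attempt
+		 * after the sleep) and rescan from the top, since the dirty
+		 * list may have changed while we slept.
+		 */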
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+ if (ap->a_waitfor != MNT_WAIT)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
+ BO_LOCKPTR(bo)) != 0) {
+ BO_LOCK(bo);
+ goto loop1;
+ }
+ BO_LOCK(bo);
+ }
+ BO_UNLOCK(bo);
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ if ((bp->b_flags & B_DELWRI) == 0)
+ panic("fsync: not dirty");
+ if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bawrite(bp);
+ }
+ BO_LOCK(bo);
+ goto loop2;
+ }
+
+ /*
+ * If synchronous the caller expects us to completely resolve all
+ * dirty buffers in the system. Wait for in-progress I/O to
+ * complete (which could include background bitmap writes), then
+ * retry if dirty blocks still exist.
+ */
+ if (ap->a_waitfor == MNT_WAIT) {
+ bufobj_wwait(bo, 0, 0);
+ if (bo->bo_dirty.bv_cnt > 0) {
+ /*
+ * If we are unable to write any of these buffers
+ * then we fail now rather than trying endlessly
+ * to write them out.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
+ if ((error = bp->b_error) == 0)
+ continue;
+ if (error == 0 && --maxretry >= 0)
+ goto loop1;
+ error = EAGAIN;
+ }
+ }
+ BO_UNLOCK(bo);
+ if (error == EAGAIN)
+ vprint("fsync: giving up on dirty", vp);
+
+ return (error);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
+int
+vop_stdgetpages(ap)
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_reqpage;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
+ ap->a_count, ap->a_reqpage);
+}
+
+int
+vop_stdkqfilter(struct vop_kqfilter_args *ap)
+{
+ return vfs_kqfilter(ap);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
+int
+vop_stdputpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_sync, ap->a_rtvals);
+}
+
+int
+vop_stdvptofh(struct vop_vptofh_args *ap)
+{
+ return (EOPNOTSUPP);
+}
+
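+/*
+ * Default vnode-to-component-name translation: open this directory's parent
+ * via "..", scan the parent with VOP_READDIR() for an entry whose inode
+ * number matches this vnode, and copy that name into the tail of the
+ * caller's buffer.  Only directories can be translated this way; union
+ * mounts get special handling so that a name shadowed by the upper layer
+ * is not returned.
+ */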
+int
+vop_stdvptocnp(struct vop_vptocnp_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vnode **dvp = ap->a_vpp;
+ struct ucred *cred = ap->a_cred;
+ char *buf = ap->a_buf;
+ int *buflen = ap->a_buflen;
+ char *dirbuf, *cpos;
+ int i, error, eofflag, dirbuflen, flags, locked, len, covered;
+ off_t off;
+ ino_t fileno;
+ struct vattr va;
+ struct nameidata nd;
+ struct thread *td;
+ struct dirent *dp;
+ struct vnode *mvp;
+
+ i = *buflen;
+ error = 0;
+ covered = 0;
+ td = curthread;
+
+ if (vp->v_type != VDIR)
+ return (ENOENT);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error)
+ return (error);
+
+ VREF(vp);
+ locked = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ "..", vp, td);
+ flags = FREAD;
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
+ if (error) {
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ mvp = *dvp = nd.ni_vp;
+
+ if (vp->v_mount != (*dvp)->v_mount &&
+ ((*dvp)->v_vflag & VV_ROOT) &&
+ ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
+ *dvp = (*dvp)->v_mount->mnt_vnodecovered;
+ VREF(mvp);
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ VREF(*dvp);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ covered = 1;
+ }
+
+ fileno = va.va_fileid;
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ if ((*dvp)->v_type != VDIR) {
+ error = ENOENT;
+ goto out;
+ }
+
+ off = 0;
+ len = 0;
+ do {
+ /* call VOP_READDIR of parent */
+ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if ((dp->d_type != DT_WHT) &&
+ (dp->d_fileno == fileno)) {
+ if (covered) {
+ VOP_UNLOCK(*dvp, 0);
+ vn_lock(mvp, LK_EXCLUSIVE | LK_RETRY);
+ if (dirent_exists(mvp, dp->d_name, td)) {
+ error = ENOENT;
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ goto out;
+ }
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ }
+ i -= dp->d_namlen;
+
+ if (i < 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
+ error = ENOENT;
+ } else {
+ bcopy(dp->d_name, buf + i, dp->d_namlen);
+ error = 0;
+ }
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+ error = ENOENT;
+
+out:
+ free(dirbuf, M_TEMP);
+ if (!error) {
+ *buflen = i;
+ vref(*dvp);
+ }
+ if (covered) {
+ vput(*dvp);
+ vrele(mvp);
+ } else {
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ }
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+}
+
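+/*
+ * Default VOP_ALLOCATE() (the backend for posix_fallocate(2)): most
+ * filesystems have no native preallocation primitive, so force backing
+ * store to be allocated by reading each block in the range (zero-filling
+ * beyond EOF) and writing it straight back.  The loop returns early when it
+ * should yield the CPU; the unprocessed remainder of the range is returned
+ * through a_offset/a_len so the caller can restart the operation.
+ */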
+int
+vop_stdallocate(struct vop_allocate_args *ap)
+{
+#ifdef __notyet__
+ struct statfs sfs;
+#endif
+ struct iovec aiov;
+ struct vattr vattr, *vap;
+ struct uio auio;
+ off_t fsize, len, cur, offset;
+ uint8_t *buf;
+ struct thread *td;
+ struct vnode *vp;
+ size_t iosize;
+ int error;
+
+ buf = NULL;
+ error = 0;
+ td = curthread;
+ vap = &vattr;
+ vp = ap->a_vp;
+ len = *ap->a_len;
+ offset = *ap->a_offset;
+
+ error = VOP_GETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ fsize = vap->va_size;
+ iosize = vap->va_blocksize;
+ if (iosize == 0)
+ iosize = BLKDEV_IOSIZE;
+ if (iosize > MAXPHYS)
+ iosize = MAXPHYS;
+ buf = malloc(iosize, M_TEMP, M_WAITOK);
+
+#ifdef __notyet__
+ /*
+ * Check if the filesystem sets f_maxfilesize; if not use
+ * VOP_SETATTR to perform the check.
+ */
+ error = VFS_STATFS(vp->v_mount, &sfs, td);
+ if (error != 0)
+ goto out;
+ if (sfs.f_maxfilesize) {
+ if (offset > sfs.f_maxfilesize || len > sfs.f_maxfilesize ||
+ offset + len > sfs.f_maxfilesize) {
+ error = EFBIG;
+ goto out;
+ }
+ } else
+#endif
+ if (offset + len > vap->va_size) {
+ /*
+ * Test offset + len against the filesystem's maxfilesize.
+ */
+ VATTR_NULL(vap);
+ vap->va_size = offset + len;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ VATTR_NULL(vap);
+ vap->va_size = fsize;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ }
+
+ for (;;) {
+ /*
+ * Read and write back anything below the nominal file
+ * size. There's currently no way outside the filesystem
+ * to know whether this area is sparse or not.
+ */
+ cur = iosize;
+ if ((offset % iosize) != 0)
+ cur -= (offset % iosize);
+ if (cur > len)
+ cur = len;
+ if (offset < fsize) {
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ error = VOP_READ(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+ if (auio.uio_resid > 0) {
+ bzero(buf + cur - auio.uio_resid,
+ auio.uio_resid);
+ }
+ } else {
+ bzero(buf, cur);
+ }
+
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+
+ error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+
+ len -= cur;
+ offset += cur;
+ if (len == 0)
+ break;
+ if (should_yield())
+ break;
+ }
+
+ out:
+ *ap->a_len = len;
+ *ap->a_offset = offset;
+ free(buf, M_TEMP);
+ return (error);
+}
+
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+ struct vnode *vp;
+ off_t start, end;
+ int error;
+
+ vp = ap->a_vp;
+ switch (ap->a_advice) {
+ case POSIX_FADV_WILLNEED:
+ /*
+ * Do nothing for now. Filesystems should provide a
+ * custom method which starts an asynchronous read of
+ * the requested region.
+ */
+ error = 0;
+ break;
+ case POSIX_FADV_DONTNEED:
+ /*
+ * Flush any open FS buffers and then remove pages
+ * from the backing VM object. Using vinvalbuf() here
+ * is a bit heavy-handed as it flushes all buffers for
+ * the given vnode, not just the buffers covering the
+ * requested range.
+ */
+ error = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ VOP_UNLOCK(vp, 0);
+ break;
+ }
+ vinvalbuf(vp, V_CLEANONLY, 0, 0);
+ if (vp->v_object != NULL) {
+ start = trunc_page(ap->a_start);
+ end = round_page(ap->a_end);
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+ OFF_TO_IDX(end));
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ VOP_UNLOCK(vp, 0);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+int
+vop_stdunp_bind(struct vop_unp_bind_args *ap)
+{
+
+ ap->a_vp->v_socket = ap->a_socket;
+ return (0);
+}
+
+int
+vop_stdunp_connect(struct vop_unp_connect_args *ap)
+{
+
+ *ap->a_socket = ap->a_vp->v_socket;
+ return (0);
+}
+
+int
+vop_stdunp_detach(struct vop_unp_detach_args *ap)
+{
+
+ ap->a_vp->v_socket = NULL;
+ return (0);
+}
+
+static int
+vop_stdis_text(struct vop_is_text_args *ap)
+{
+
+ return ((ap->a_vp->v_vflag & VV_TEXT) != 0);
+}
+
+static int
+vop_stdset_text(struct vop_set_text_args *ap)
+{
+
+ ap->a_vp->v_vflag |= VV_TEXT;
+ return (0);
+}
+
+static int
+vop_stdunset_text(struct vop_unset_text_args *ap)
+{
+
+ ap->a_vp->v_vflag &= ~VV_TEXT;
+ return (0);
+}
+
+static int
+vop_stdget_writecount(struct vop_get_writecount_args *ap)
+{
+
+ *ap->a_writecount = ap->a_vp->v_writecount;
+ return (0);
+}
+
+static int
+vop_stdadd_writecount(struct vop_add_writecount_args *ap)
+{
+
+ ap->a_vp->v_writecount += ap->a_inc;
+ return (0);
+}
+
+/*
+ * vfs default ops
+ * used to fill the vfs function table to get reasonable default return values.
+ */
+int
+vfs_stdroot (mp, flags, vpp)
+ struct mount *mp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdstatfs (mp, sbp)
+ struct mount *mp;
+ struct statfs *sbp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdquotactl (mp, cmds, uid, arg)
+ struct mount *mp;
+ int cmds;
+ uid_t uid;
+ void *arg;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsync(mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+ struct vnode *vp, *mvp;
+ struct thread *td;
+ int error, lockreq, allerror = 0;
+
+ td = curthread;
+ lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+ if (waitfor != MNT_WAIT)
+ lockreq |= LK_NOWAIT;
+ /*
+ * Force stale buffer cache information to be flushed.
+ */
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
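+		/*
+		 * vget() returns ENOENT if the vnode was reclaimed while we
+		 * slept waiting for its lock; in that case abort the
+		 * iteration and restart the scan from the head of the list.
+		 */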
+ if ((error = vget(vp, lockreq, td)) != 0) {
+ if (error == ENOENT) {
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ continue;
+ }
+ error = VOP_FSYNC(vp, waitfor, td);
+ if (error)
+ allerror = error;
+ vput(vp);
+ }
+ return (allerror);
+}
+
+int
+vfs_stdnosync (mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+
+ return (0);
+}
+
+int
+vfs_stdvget (mp, ino, flags, vpp)
+ struct mount *mp;
+ ino_t ino;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdfhtovp (mp, fhp, flags, vpp)
+ struct mount *mp;
+ struct fid *fhp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdinit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return (0);
+}
+
+int
+vfs_stduninit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return(0);
+}
+
+int
+vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)
+ struct mount *mp;
+ int cmd;
+ struct vnode *filename_vp;
+ int attrnamespace;
+ const char *attrname;
+{
+
+ if (filename_vp != NULL)
+ VOP_UNLOCK(filename_vp, 0);
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsysctl(mp, op, req)
+ struct mount *mp;
+ fsctlop_t op;
+ struct sysctl_req *req;
+{
+
+ return (EOPNOTSUPP);
+}
+
+/* end of vfs default ops */
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
new file mode 100644
index 0000000..6a3f291
--- /dev/null
+++ b/sys/kern/vfs_export.c
@@ -0,0 +1,493 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/domain.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/refcount.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <net/radix.h>
+
+static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure");
+
+static void vfs_free_addrlist(struct netexport *nep);
+static int vfs_free_netcred(struct radix_node *rn, void *w);
+static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp);
+static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *);
+
+/*
+ * Network address lookup element
+ */
+struct netcred {
+ struct radix_node netc_rnodes[2];
+ int netc_exflags;
+ struct ucred *netc_anon;
+ int netc_numsecflavors;
+ int netc_secflavors[MAXSECFLAVORS];
+};
+
+/*
+ * Network export information
+ */
+struct netexport {
+ struct netcred ne_defexported; /* Default export */
+ struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */
+};
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by vfs_export() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ /*
+ * XXX: This routine converts from a `struct xucred'
+ * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This
+ * operation is questionable; for example, what should be done
+ * with fields like cr_uidinfo and cr_prison? Currently, this
+ * routine does not touch them (leaves them as NULL).
+ */
+ if (argp->ex_anon.cr_version != XUCRED_VERSION) {
+ vfs_mount_error(mp, "ex_anon.cr_version: %d != %d",
+ argp->ex_anon.cr_version, XUCRED_VERSION);
+ return (EINVAL);
+ }
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED) {
+ vfs_mount_error(mp,
+ "MNT_DEFEXPORTED already set for mount %p", mp);
+ return (EPERM);
+ }
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+
+#if MSIZE <= 256
+ if (argp->ex_addrlen > MLEN) {
+ vfs_mount_error(mp, "ex_addrlen %d is greater than %d",
+ argp->ex_addrlen, MLEN);
+ return (EINVAL);
+ }
+#endif
+
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) {
+ error = EINVAL;
+		vfs_mount_error(mp, "Invalid saddr->sa_family: %d",
+		    saddr->sa_family);
+ goto out;
+ }
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == NULL) {
+ /*
+ * Seems silly to initialize every AF when most are not used,
+ * do so on demand here
+ */
+ for (dom = domains; dom; dom = dom->dom_next) {
+ KASSERT(((i == AF_INET) || (i == AF_INET6)),
+ ("unexpected protocol in vfs_hang_addrlist"));
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ /*
+ * XXX MRT
+ * The INET and INET6 domains know the
+ * offset already. We don't need to send it
+ * So we just use it as a flag to say that
+ * we are or are not setting up a real routing
+ * table. Only IP and IPV6 need have this
+ * be 0 so all other protocols can stay the
+ * same (ABI compatible).
+ */
+ dom->dom_rtattach(
+ (void **) &nep->ne_rtable[i], 0);
+ break;
+ }
+ }
+ if ((rnh = nep->ne_rtable[i]) == NULL) {
+ error = ENOBUFS;
+ vfs_mount_error(mp, "%s %s %d",
+ "Unable to initialize radix node head ",
+ "for address family", i);
+ goto out;
+ }
+ }
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rn = (*rnh->rnh_addaddr)(saddr, smask, rnh, np->netc_rnodes);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
+ error = EPERM;
+ vfs_mount_error(mp, "Invalid radix node head, rn: %p %p",
+ rn, np);
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* Helper for vfs_free_addrlist. */
+/* ARGSUSED */
+static int
+vfs_free_netcred(struct radix_node *rn, void *w)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *) w;
+ struct ucred *cred;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ cred = ((struct netcred *)rn)->netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+ free(rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(struct netexport *nep)
+{
+ int i;
+ struct radix_node_head *rnh;
+ struct ucred *cred;
+
+ for (i = 0; i <= AF_MAX; i++) {
+ if ((rnh = nep->ne_rtable[i])) {
+ RADIX_NODE_HEAD_LOCK(rnh);
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ RADIX_NODE_HEAD_DESTROY(rnh);
+ free(rnh, M_RTABLE);
+ nep->ne_rtable[i] = NULL; /* not SMP safe XXX */
+ }
+ }
+ cred = nep->ne_defexported.netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+}
+
+/*
+ * High level function to manipulate export options on a mount point
+ * and the passed in netexport.
+ * Struct export_args *argp is the variable used to twiddle options,
+ * the structure is described in sys/mount.h
+ */
+int
+vfs_export(struct mount *mp, struct export_args *argp)
+{
+ struct netexport *nep;
+ int error;
+
+ if (argp->ex_numsecflavors < 0
+ || argp->ex_numsecflavors >= MAXSECFLAVORS)
+ return (EINVAL);
+
+ error = 0;
+ lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL);
+ nep = mp->mnt_export;
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (nep == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_export = NULL;
+ free(nep, M_MOUNT);
+ nep = NULL;
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ MNT_IUNLOCK(mp);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (nep == NULL) {
+			nep = malloc(sizeof(struct netexport), M_MOUNT,
+			    M_WAITOK | M_ZERO);
+ mp->mnt_export = nep;
+ }
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPORTED;
+ MNT_IUNLOCK(mp);
+ }
+
+out:
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ /*
+ * Once we have executed the vfs_export() command, we do
+ * not want to keep the "export" option around in the
+ * options list, since that will cause subsequent MNT_UPDATE
+ * calls to fail. The export information is saved in
+ * mp->mnt_export, so we can safely delete the "export" mount option
+ * here.
+ */
+ vfs_deleteopt(mp->mnt_optnew, "export");
+ vfs_deleteopt(mp->mnt_opt, "export");
+ return (error);
+}
+
+/*
+ * Set the publicly exported filesystem (WebNFS). Currently, only
+ * one public filesystem is possible in the spec (RFC 2054 and 2055)
+ */
+int
+vfs_setpublicfs(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp)))
+ return (error);
+
+ if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+		if (nfs_pub.np_index == NULL)
+ nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+/*
+ * Used by the filesystems to determine if a given network address
+ * (passed in 'nam') is present in their exports list, returns a pointer
+ * to struct netcred so that the filesystem can examine it for
+ * access rights (read/write/etc).
+ */
+static struct netcred *
+vfs_export_lookup(struct mount *mp, struct sockaddr *nam)
+{
+ struct netexport *nep;
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ nep = mp->mnt_export;
+ if (nep == NULL)
+ return (NULL);
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ RADIX_NODE_HEAD_RLOCK(rnh);
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)(saddr, rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh);
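+				/*
+				 * The radix tree's internal root nodes
+				 * carry RNF_ROOT and are not real exports;
+				 * treat a match on one as no match.
+				 */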
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
+
+/*
+ * XXX: This comment comes from the deprecated ufs_check_export()
+ * XXX: and may not entirely apply, but lacking something better:
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Verify that a host should have access to a filesystem.
+ */
+
+int
+vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+{
+ struct netcred *np;
+
+ lockmgr(&mp->mnt_explock, LK_SHARED, NULL);
+ np = vfs_export_lookup(mp, nam);
+ if (np == NULL) {
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ *credanonp = NULL;
+ return (EACCES);
+ }
+ *extflagsp = np->netc_exflags;
+ if ((*credanonp = np->netc_anon) != NULL)
+ crhold(*credanonp);
+ if (numsecflavors)
+ *numsecflavors = np->netc_numsecflavors;
+ if (secflavors)
+ *secflavors = np->netc_secflavors;
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ return (0);
+}
+
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..bc7b942
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,765 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/fcntl.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/limits.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/extattr.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+/*
+ * Syscall to push extended attribute configuration information into the VFS.
+ * Accepts a path, which it converts to a mountpoint, as well as a command
+ * (int cmd), and attribute name and misc data.
+ *
+ * Currently this is used only by UFS1 extended attributes.
+ */
+int
+sys_extattrctl(td, uap)
+ struct thread *td;
+ struct extattrctl_args /* {
+ const char *path;
+ int cmd;
+ const char *filename;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+ AUDIT_ARG_TEXT(attrname);
+
+ mp = NULL;
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2,
+ UIO_USERSPACE, uap->filename, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ goto out;
+ mp = nd.ni_vp->v_mount;
+ error = vfs_busy(mp, 0);
+ if (error) {
+ NDFREE(&nd, 0);
+ mp = NULL;
+ goto out;
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, NDF_NO_VP_UNLOCK);
+ if (error)
+ goto out;
+ if (filename_vp != NULL) {
+ /*
+ * uap->filename is not always defined. If it is,
+ * grab a vnode lock, which VFS_EXTATTRCTL() will
+ * later release.
+ */
+ error = vn_lock(filename_vp, LK_EXCLUSIVE);
+ if (error) {
+ vn_finished_write(mp_writable);
+ goto out;
+ }
+ }
+
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
+ uap->attrname != NULL ? attrname : NULL);
+
+ vn_finished_write(mp_writable);
+out:
+ if (mp != NULL)
+ vfs_unbusy(mp);
+
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
+ * so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+ return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+#ifdef MAC
+ error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+sys_extattr_set_fd(td, uap)
+ struct thread *td;
+ struct extattr_set_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+sys_extattr_set_file(td, uap)
+ struct thread *td;
+ struct extattr_set_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_set_link(td, uap)
+ struct thread *td;
+ struct extattr_set_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the maximum
+ * read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
+
+int
+sys_extattr_get_fd(td, uap)
+ struct thread *td;
+ struct extattr_get_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_GET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_get_file(td, uap)
+ struct thread *td;
+ struct extattr_get_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_get_link(td, uap)
+ struct thread *td;
+ struct extattr_get_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", proc "p"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+#ifdef MAC
+ error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
+ td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ td->td_ucred, td);
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+sys_extattr_delete_fd(td, uap)
+ struct thread *td;
+ struct extattr_delete_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_delete_file(td, uap)
+ struct thread *td;
+ struct extattr_delete_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ return(error);
+}
+
+int
+sys_extattr_delete_link(td, uap)
+ struct thread *td;
+ struct extattr_delete_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ return(error);
+}
+
+/*-
+ * Retrieve a list of extended attributes on a file or directory.
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * userspace buffer pointer "data", buffer length "nbytes",
+ * thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
+ size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ size_t size, *sizep;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
+
+
+int
+sys_extattr_list_fd(td, uap)
+ struct thread *td;
+ struct extattr_list_fd_args /* {
+ int fd;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_list_file(td, uap)
+	struct thread *td;
+ struct extattr_list_file_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_list_link(td, uap)
+	struct thread *td;
+ struct extattr_list_link_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c
new file mode 100644
index 0000000..0271e49
--- /dev/null
+++ b/sys/kern/vfs_hash.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2005 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
+
+static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl;
+static LIST_HEAD(,vnode) vfs_hash_side;
+static u_long vfs_hash_mask;
+static struct mtx vfs_hash_mtx;
+
+static void
+vfs_hashinit(void *dummy __unused)
+{
+
+ vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask);
+ mtx_init(&vfs_hash_mtx, "vfs hash", NULL, MTX_DEF);
+ LIST_INIT(&vfs_hash_side);
+}
+
+/* Must be SI_ORDER_SECOND so desiredvnodes is available */
+SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL);
+
+u_int
+vfs_hash_index(struct vnode *vp)
+{
+
+ return (vp->v_hash + vp->v_mount->mnt_hashseed);
+}
+
+static struct vfs_hash_head *
+vfs_hash_bucket(const struct mount *mp, u_int hash)
+{
+
+ return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
+}
+
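+/*
+ * Look up a vnode by (mount, hash), using an optional comparison callback
+ * to disambiguate hash collisions.  A matching vnode is returned in *vpp,
+ * referenced and locked according to 'flags'; if it is reclaimed while we
+ * wait for its lock (vget() returns ENOENT), the lookup is retried.  *vpp
+ * is set to NULL when no match exists.
+ */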
+int
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp;
+ int error;
+
+ while (1) {
+ mtx_lock(&vfs_hash_mtx);
+ LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+ if (vp->v_hash != hash)
+ continue;
+ if (vp->v_mount != mp)
+ continue;
+ if (fn != NULL && fn(vp, arg))
+ continue;
+ VI_LOCK(vp);
+ mtx_unlock(&vfs_hash_mtx);
+ error = vget(vp, flags | LK_INTERLOCK, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ if (error)
+ return (error);
+ *vpp = vp;
+ return (0);
+ }
+ if (vp == NULL) {
+ mtx_unlock(&vfs_hash_mtx);
+ *vpp = NULL;
+ return (0);
+ }
+ }
+}
+
+void
+vfs_hash_remove(struct vnode *vp)
+{
+
+ mtx_lock(&vfs_hash_mtx);
+ LIST_REMOVE(vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+}
+
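+/*
+ * Insert a vnode into the hash table, unless a vnode with the same
+ * (mount, hash) identity already exists.  On a collision the pre-existing
+ * vnode is acquired with vget() and returned in *vpp, while the caller's
+ * vnode is put on a side list (so vfs_hash_remove() still works for it)
+ * and released with vput(); otherwise the vnode is inserted and *vpp is
+ * left NULL.
+ */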
+int
+vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp2;
+ int error;
+
+ *vpp = NULL;
+ while (1) {
+ mtx_lock(&vfs_hash_mtx);
+ LIST_FOREACH(vp2,
+ vfs_hash_bucket(vp->v_mount, hash), v_hashlist) {
+ if (vp2->v_hash != hash)
+ continue;
+ if (vp2->v_mount != vp->v_mount)
+ continue;
+ if (fn != NULL && fn(vp2, arg))
+ continue;
+ VI_LOCK(vp2);
+ mtx_unlock(&vfs_hash_mtx);
+ error = vget(vp2, flags | LK_INTERLOCK, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ mtx_lock(&vfs_hash_mtx);
+ LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+ vput(vp);
+ if (!error)
+ *vpp = vp2;
+ return (error);
+ }
+ if (vp2 == NULL)
+ break;
+
+ }
+ vp->v_hash = hash;
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+ return (0);
+}
+
+void
+vfs_hash_rehash(struct vnode *vp, u_int hash)
+{
+
+ mtx_lock(&vfs_hash_mtx);
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ vp->v_hash = hash;
+ mtx_unlock(&vfs_hash_mtx);
+}
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
new file mode 100644
index 0000000..eab48fb
--- /dev/null
+++ b/sys/kern/vfs_init.c
@@ -0,0 +1,344 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+
+static int vfs_register(struct vfsconf *);
+static int vfs_unregister(struct vfsconf *);
+
+MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
+
+/*
+ * The highest defined VFS number.
+ */
+int maxvfsconf = VFS_GENERIC + 1;
+
+/*
+ * Single-linked list of configured VFSes.
+ * New entries are added/deleted by vfs_register()/vfs_unregister()
+ */
+struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf);
+
+/*
+ * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash
+ * calculation on vfc_name, so that it doesn't change when file systems are
+ * loaded in a different order. This keeps the NFS server file handles from
+ * changing for file systems that use vfc_typenum in their fsid.
+ */
+static int vfs_typenumhash = 1;
+TUNABLE_INT("vfs.typenumhash", &vfs_typenumhash);
+SYSCTL_INT(_vfs, OID_AUTO, typenumhash, CTLFLAG_RDTUN, &vfs_typenumhash, 0,
+    "Set vfc_typenum using a hash calculation on vfc_name, so that it does not "
+ "change when file systems are loaded in a different order.");
+
+/*
+ * A Zen vnode attribute structure.
+ *
+ * Initialized when the first filesystem registers by vfs_register().
+ */
+struct vattr va_null;
+
+/*
+ * vfs_init.c
+ *
+ * Allocate and fill in operations vectors.
+ *
+ * An undocumented feature of this approach to defining operations is that
+ * there can be multiple entries in vfs_opv_descs for the same operations
+ * vector. This allows third parties to extend the set of operations
+ * supported by another layer in a binary compatible way. For example,
+ * assume that NFS needed to be modified to support Ficus. NFS has an entry
+ * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by
+ * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions)
+ * listing those new operations Ficus adds to NFS, all without modifying the
+ * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but
+ * that is a(whole)nother story.) This is a feature.
+ */
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+struct vfsconf *
+vfs_byname(const char *name)
+{
+ struct vfsconf *vfsp;
+
+ if (!strcmp(name, "ffs"))
+ name = "ufs";
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (!strcmp(name, vfsp->vfc_name))
+ return (vfsp);
+ return (NULL);
+}
+
+struct vfsconf *
+vfs_byname_kld(const char *fstype, struct thread *td, int *error)
+{
+ struct vfsconf *vfsp;
+ int fileid, loaded;
+
+ vfsp = vfs_byname(fstype);
+ if (vfsp != NULL)
+ return (vfsp);
+
+ /* Try to load the respective module. */
+ *error = kern_kldload(td, fstype, &fileid);
+ loaded = (*error == 0);
+ if (*error == EEXIST)
+ *error = 0;
+ if (*error)
+ return (NULL);
+
+ /* Look up again to see if the VFS was loaded. */
+ vfsp = vfs_byname(fstype);
+ if (vfsp == NULL) {
+ if (loaded)
+ (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
+ *error = ENODEV;
+ return (NULL);
+ }
+ return (vfsp);
+}
+
+
+/* Register a new filesystem type in the global table */
+static int
+vfs_register(struct vfsconf *vfc)
+{
+ struct sysctl_oid *oidp;
+ struct vfsops *vfsops;
+ static int once;
+ struct vfsconf *tvfc;
+ uint32_t hashval;
+ int secondpass;
+
+ if (!once) {
+ vattr_null(&va_null);
+ once = 1;
+ }
+
+ if (vfc->vfc_version != VFS_VERSION) {
+ printf("ERROR: filesystem %s, unsupported ABI version %x\n",
+ vfc->vfc_name, vfc->vfc_version);
+ return (EINVAL);
+ }
+ if (vfs_byname(vfc->vfc_name) != NULL)
+ return (EEXIST);
+
+ if (vfs_typenumhash != 0) {
+ /*
+ * Calculate a hash on vfc_name to use for vfc_typenum. Unless
+ * all values in 1..255 are already assigned, the result is kept
+ * within 8 bits, since that is what ZFS uses from vfc_typenum and
+ * is also the preferred range for vfs_getnewfsid().
+ */
+ hashval = fnv_32_str(vfc->vfc_name, FNV1_32_INIT);
+ hashval &= 0xff;
+ secondpass = 0;
+ do {
+ /* Look for and fix any collision. */
+ TAILQ_FOREACH(tvfc, &vfsconf, vfc_list) {
+ if (hashval == tvfc->vfc_typenum) {
+ if (hashval == 255 && secondpass == 0) {
+ hashval = 1;
+ secondpass = 1;
+ } else
+ hashval++;
+ break;
+ }
+ }
+ } while (tvfc != NULL);
+ vfc->vfc_typenum = hashval;
+ if (vfc->vfc_typenum >= maxvfsconf)
+ maxvfsconf = vfc->vfc_typenum + 1;
+ } else
+ vfc->vfc_typenum = maxvfsconf++;
+ TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list);
+
+ /*
+ * If this filesystem has a sysctl node under vfs
+ * (i.e. vfs.xxfs), then change the oid number of that node to
+ * match the filesystem's type number. This allows user code
+ * which uses the type number to read sysctl variables defined
+ * by the filesystem to continue working. Since the oids are
+ * in a sorted list, we need to make sure the order is
+ * preserved by re-registering the oid after modifying its
+ * number.
+ */
+ sysctl_lock();
+ SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
+ if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+ sysctl_unregister_oid(oidp);
+ oidp->oid_number = vfc->vfc_typenum;
+ sysctl_register_oid(oidp);
+ break;
+ }
+ sysctl_unlock();
+
+ /*
+ * Initialise unused ``struct vfsops'' fields, to use
+ * the vfs_std*() functions. Note, we need the mount
+ * and unmount operations, at the least. The check
+ * for vfsops available is just a debugging aid.
+ */
+ KASSERT(vfc->vfc_vfsops != NULL,
+ ("Filesystem %s has no vfsops", vfc->vfc_name));
+ /*
+ * Check the mount and unmount operations.
+ */
+ vfsops = vfc->vfc_vfsops;
+ KASSERT(vfsops->vfs_mount != NULL,
+ ("Filesystem %s has no mount op", vfc->vfc_name));
+ KASSERT(vfsops->vfs_unmount != NULL,
+ ("Filesystem %s has no unmount op", vfc->vfc_name));
+
+ if (vfsops->vfs_root == NULL)
+ /* return file system's root vnode */
+ vfsops->vfs_root = vfs_stdroot;
+ if (vfsops->vfs_quotactl == NULL)
+ /* quota control */
+ vfsops->vfs_quotactl = vfs_stdquotactl;
+ if (vfsops->vfs_statfs == NULL)
+ /* return file system's status */
+ vfsops->vfs_statfs = vfs_stdstatfs;
+ if (vfsops->vfs_sync == NULL)
+ /*
+ * flush unwritten data (nosync)
+ * file systems can use vfs_stdsync
+ * explicitly by setting it in the
+ * vfsop vector.
+ */
+ vfsops->vfs_sync = vfs_stdnosync;
+ if (vfsops->vfs_vget == NULL)
+ /* convert an inode number to a vnode */
+ vfsops->vfs_vget = vfs_stdvget;
+ if (vfsops->vfs_fhtovp == NULL)
+ /* turn an NFS file handle into a vnode */
+ vfsops->vfs_fhtovp = vfs_stdfhtovp;
+ if (vfsops->vfs_checkexp == NULL)
+ /* check if file system is exported */
+ vfsops->vfs_checkexp = vfs_stdcheckexp;
+ if (vfsops->vfs_init == NULL)
+ /* file system specific initialisation */
+ vfsops->vfs_init = vfs_stdinit;
+ if (vfsops->vfs_uninit == NULL)
+ /* file system specific uninitialisation */
+ vfsops->vfs_uninit = vfs_stduninit;
+ if (vfsops->vfs_extattrctl == NULL)
+ /* extended attribute control */
+ vfsops->vfs_extattrctl = vfs_stdextattrctl;
+ if (vfsops->vfs_sysctl == NULL)
+ vfsops->vfs_sysctl = vfs_stdsysctl;
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+
+ return (0);
+}
+
+
+/* Remove registration of a filesystem type */
+static int
+vfs_unregister(struct vfsconf *vfc)
+{
+ struct vfsconf *vfsp;
+ int error, i, maxtypenum;
+
+ i = vfc->vfc_typenum;
+
+ vfsp = vfs_byname(vfc->vfc_name);
+ if (vfsp == NULL)
+ return (EINVAL);
+ if (vfsp->vfc_refcount)
+ return (EBUSY);
+ if (vfc->vfc_vfsops->vfs_uninit != NULL) {
+ error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
+ if (error)
+ return (error);
+ }
+ TAILQ_REMOVE(&vfsconf, vfsp, vfc_list);
+ maxtypenum = VFS_GENERIC;
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+ return (0);
+}
+
+/*
+ * Standard kernel module handling code for filesystem modules.
+ * Referenced from VFS_SET().
+ */
+int
+vfs_modevent(module_t mod, int type, void *data)
+{
+ struct vfsconf *vfc;
+ int error = 0;
+
+ vfc = (struct vfsconf *)data;
+
+ switch (type) {
+ case MOD_LOAD:
+ if (vfc)
+ error = vfs_register(vfc);
+ break;
+
+ case MOD_UNLOAD:
+ if (vfc)
+ error = vfs_unregister(vfc);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
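+
+/*
+ * A minimal sketch of how a filesystem module typically reaches
+ * vfs_modevent(): it fills in only the vfsops it implements, relies on
+ * vfs_register() to supply the vfs_std*() defaults for the rest, and
+ * announces itself with VFS_SET().  The myfs_* names are hypothetical
+ * (assumed to be defined elsewhere) and the block is not compiled.
+ */
+#if 0
+static struct vfsops myfs_vfsops = {
+ .vfs_mount = myfs_mount, /* required, see the KASSERT in vfs_register() */
+ .vfs_unmount = myfs_unmount, /* required */
+ .vfs_statfs = myfs_statfs, /* optional; defaults to vfs_stdstatfs */
+};
+
+/* Expands to a struct vfsconf plus a DECLARE_MODULE() using vfs_modevent. */
+VFS_SET(myfs_vfsops, myfs, 0);
+#endif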
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
new file mode 100644
index 0000000..d4d0166
--- /dev/null
+++ b/sys/kern/vfs_lookup.c
@@ -0,0 +1,1254 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+#define NAMEI_DIAGNOSTIC 1
+#undef NAMEI_DIAGNOSTIC
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, entry, "struct vnode *", "char *",
+ "unsigned long");
+SDT_PROBE_DEFINE2(vfs, namei, lookup, return, return, "int", "struct vnode *");
+
+/*
+ * Allocation zone for namei
+ */
+uma_zone_t namei_zone;
+/*
+ * Placeholder vnode for mount point traversal
+ */
+static struct vnode *vp_crossmp;
+
+static void
+nameiinit(void *dummy __unused)
+{
+
+ namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
+ vn_lock(vp_crossmp, LK_EXCLUSIVE);
+ VN_LOCK_ASHARE(vp_crossmp);
+ VOP_UNLOCK(vp_crossmp, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
+
+static int lookup_shared = 1;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
+ "Enables/Disables shared locks for path name translation");
+TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ struct filedesc *fdp; /* pointer to file descriptor state */
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct uio auio;
+ int error, linklen;
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct thread *td = cnp->cn_thread;
+ struct proc *p = td->td_proc;
+
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ if (!lookup_shared)
+ cnp->cn_flags &= ~LOCKSHARED;
+ fdp = p->p_fd;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (!error && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * In capability mode, lookups must be "strictly relative" (i.e.
+ * not an absolute path, and not containing '..' components) to
+ * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
+ */
+ if (error == 0 && IN_CAPABILITY_MODE(td) &&
+ (cnp->cn_flags & NOCAPCHECK) == 0) {
+ ndp->ni_strictrelative = 1;
+ if (ndp->ni_dirfd == AT_FDCWD) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ error = ECAPMODE;
+ }
+ }
+#endif
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ ndp->ni_vp = NULL;
+ return (error);
+ }
+ ndp->ni_loopcnt = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_NAMEI)) {
+ KASSERT(cnp->cn_thread == curthread,
+ ("namei not using curthread"));
+ ktrnamei(cnp->cn_pnbuf);
+ }
+#endif
+ /*
+ * Get starting point for the translation.
+ */
+ FILEDESC_SLOCK(fdp);
+ ndp->ni_rootdir = fdp->fd_rdir;
+ ndp->ni_topdir = fdp->fd_jdir;
+
+ /*
+ * If we are auditing the kernel pathname, save the user pathname.
+ */
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+
+ dp = NULL;
+ if (cnp->cn_pnbuf[0] != '/') {
+ if (ndp->ni_startdir != NULL) {
+ dp = ndp->ni_startdir;
+ error = 0;
+ } else if (ndp->ni_dirfd != AT_FDCWD) {
+ cap_rights_t rights;
+
+ rights = ndp->ni_rightsneeded;
+ cap_rights_set(&rights, CAP_LOOKUP);
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_ATFD1(ndp->ni_dirfd);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_ATFD2(ndp->ni_dirfd);
+ error = fgetvp_rights(td, ndp->ni_dirfd,
+ &rights, &ndp->ni_filecaps, &dp);
+#ifdef CAPABILITIES
+ /*
+ * If file descriptor doesn't have all rights,
+ * all lookups relative to it must also be
+ * strictly relative.
+ */
+ CAP_ALL(&rights);
+ if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
+ &rights) ||
+ ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
+ ndp->ni_filecaps.fc_nioctls != -1) {
+ ndp->ni_strictrelative = 1;
+ }
+#endif
+ }
+ if (error != 0 || dp != NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ if (error == 0 && dp->v_type != VDIR) {
+ vrele(dp);
+ error = ENOTDIR;
+ }
+ }
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ return (error);
+ }
+ }
+ if (dp == NULL) {
+ dp = fdp->fd_cdir;
+ VREF(dp);
+ FILEDESC_SUNLOCK(fdp);
+ if (ndp->ni_startdir != NULL)
+ vrele(ndp->ni_startdir);
+ }
+ SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ cnp->cn_flags, 0, 0);
+ for (;;) {
+ /*
+ * Check if root directory should replace current directory.
+ * Done at start of translation and after symbolic link.
+ */
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (*(cnp->cn_nameptr) == '/') {
+ vrele(dp);
+ if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ return (ENOTCAPABLE);
+ }
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ dp = ndp->ni_rootdir;
+ VREF(dp);
+ }
+ ndp->ni_startdir = dp;
+ error = lookup(ndp);
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
+ 0, 0);
+ return (error);
+ }
+ /*
+ * If not a symbolic link, we're done.
+ */
+ if ((cnp->cn_flags & ISSYMLINK) == 0) {
+ if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ } else
+ cnp->cn_flags |= HASBUF;
+
+ SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
+ 0, 0, 0);
+ return (0);
+ }
+ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+ error = ELOOP;
+ break;
+ }
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_readlink(td->td_ucred,
+ ndp->ni_vp);
+ if (error)
+ break;
+ }
+#endif
+ if (ndp->ni_pathlen > 1)
+ cp = uma_zalloc(namei_zone, M_WAITOK);
+ else
+ cp = cnp->cn_pnbuf;
+ aiov.iov_base = cp;
+ aiov.iov_len = MAXPATHLEN;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = MAXPATHLEN;
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ break;
+ }
+ linklen = MAXPATHLEN - auio.uio_resid;
+ if (linklen == 0) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENOENT;
+ break;
+ }
+ if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENAMETOOLONG;
+ break;
+ }
+ if (ndp->ni_pathlen > 1) {
+ bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ cnp->cn_pnbuf = cp;
+ } else
+ cnp->cn_pnbuf[linklen] = '\0';
+ ndp->ni_pathlen += linklen;
+ vput(ndp->ni_vp);
+ dp = ndp->ni_dvp;
+ }
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ vput(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ vrele(ndp->ni_dvp);
+ SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
+ return (error);
+}
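+
+/*
+ * A minimal sketch of the calling convention outlined in the comment above
+ * namei(): translate a kernel-space path into a locked, referenced vnode and
+ * release the pathname buffer.  The helper name is hypothetical and the
+ * block is not compiled.
+ */
+#if 0
+static int
+example_path_to_vnode(struct thread *td, const char *path, struct vnode **vpp)
+{
+ struct nameidata nd;
+ int error;
+
+ /* FOLLOW: follow a trailing symlink; LOCKLEAF: return the vnode locked. */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF); /* free cn_pnbuf, keep the vnode */
+ *vpp = nd.ni_vp; /* locked and referenced; the caller vput()s it */
+ return (0);
+}
+#endif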
+
+static int
+compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
+{
+
+ if (mp == NULL || ((lkflags & LK_SHARED) &&
+ (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
+ ((cnflags & ISDOTDOT) &&
+ (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
+ lkflags &= ~LK_SHARED;
+ lkflags |= LK_EXCLUSIVE;
+ }
+ return (lkflags);
+}
+
+static __inline int
+needs_exclusive_leaf(struct mount *mp, int flags)
+{
+
+ /*
+ * Intermediate nodes can use shared locks, we only need to
+ * force an exclusive lock for leaf nodes.
+ */
+ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
+ return (0);
+
+ /* Always use exclusive locks if LOCKSHARED isn't set. */
+ if (!(flags & LOCKSHARED))
+ return (1);
+
+ /*
+ * For lookups during open(), if the mount point supports
+ * extended shared operations, then use a shared lock for the
+ * leaf node, otherwise use an exclusive lock.
+ */
+ if (flags & ISOPEN) {
+ if (mp != NULL &&
+ (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED))
+ return (0);
+ else
+ return (1);
+ }
+
+ /*
+ * Lookup requests outside of open() that specify LOCKSHARED
+ * only need a shared lock on the leaf vnode.
+ */
+ return (0);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by cnp->cn_nameptr and is of length
+ * ni_pathlen. The starting directory is taken from ni_startdir. The
+ * pathname is descended until done, or a symbolic link is encountered.
+ * If a symbolic link needing interpretation is encountered, the
+ * ISSYMLINK flag is set in cn_flags so that the caller (namei()) can
+ * interpret it.
+ *
+ * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
+ * whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If flag has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into the flag
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".". When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ * identify next component of name at ndp->ni_ptr
+ * handle degenerate case where name is null string
+ * if .. and crossing mount points and on mounted filesys, find parent
+ * call VOP_LOOKUP routine for next component name
+ * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ * component vnode returned in ni_vp (if it exists), locked.
+ * if result vnode is mounted on and crossing mount points,
+ * find mounted on vnode
+ * if more components of name, do next level at dirloop
+ * return the answer in ni_vp, locked if LOCKLEAF set
+ * if LOCKPARENT set, return locked parent in ni_dvp
+ * if WANTPARENT set, return unlocked parent in ni_dvp
+ */
+int
+lookup(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp = 0; /* the directory we are searching */
+ struct vnode *tdp; /* saved dp */
+ struct mount *mp; /* mount table entry */
+ struct prison *pr;
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+ int dpunlocked = 0; /* dp has already been unlocked */
+ struct componentname *cnp = &ndp->ni_cnd;
+ int lkflags_save;
+ int ni_dvp_unlocked;
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ ni_dvp_unlocked = 0;
+ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+ KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
+ ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ ndp->ni_dvp = NULL;
+ /*
+ * We use shared locks until we hit the parent of the last cn then
+ * we adjust based on the requesting flags.
+ */
+ if (lookup_shared)
+ cnp->cn_lkflags = LK_SHARED;
+ else
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+ dp = ndp->ni_startdir;
+ ndp->ni_startdir = NULLVP;
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
+ cnp->cn_flags));
+
+dirloop:
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ cnp->cn_consume = 0;
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ { char c = *cp;
+ *cp = '\0';
+ printf("{%s}: ", cnp->cn_nameptr);
+ *cp = c; }
+#endif
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ *ndp->ni_next = '\0';
+ cnp->cn_flags |= TRAILINGSLASH;
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+ if (*cp == '\0' && docache == 0)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (wantparent) {
+ ndp->ni_dvp = dp;
+ VREF(dp);
+ }
+ ndp->ni_vp = dp;
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+ VOP_UNLOCK(dp, 0);
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ goto success;
+ }
+
+ /*
+ * Handle "..": five special cases.
+ * 0. If doing a capability lookup, return ENOTCAPABLE (this is a
+ * fairly conservative design choice, but it's the only one that we
+ * are satisfied guarantees the property we're looking for).
+ * 1. Return an error if this is the last component of
+ * the name and the operation is DELETE or RENAME.
+ * 2. If at root directory (e.g. after chroot)
+ * or at absolute root directory
+ * then ignore it so can't get out.
+ * 3. If this vnode is the root of a mounted
+ * filesystem, then replace it with the
+ * vnode which was mounted on so we take the
+ * .. in the other filesystem.
+ * 4. If the vnode is the top directory of
+ * the jail or chroot, don't let them out.
+ */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ error = ENOTCAPABLE;
+ goto bad;
+ }
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+ for (;;) {
+ for (pr = cnp->cn_cred->cr_prison; pr != NULL;
+ pr = pr->pr_parent)
+ if (dp == pr->pr_root)
+ break;
+ if (dp == ndp->ni_rootdir ||
+ dp == ndp->ni_topdir ||
+ dp == rootvnode ||
+ pr != NULL ||
+ ((dp->v_vflag & VV_ROOT) != 0 &&
+ (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = dp;
+ VREF(dp);
+ goto nextname;
+ }
+ if ((dp->v_vflag & VV_ROOT) == 0)
+ break;
+ if (dp->v_iflag & VI_DOOMED) { /* forced unmount */
+ error = ENOENT;
+ goto bad;
+ }
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, ISDOTDOT));
+ }
+ }
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+unionlookup:
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
+ cnp);
+ if (error)
+ goto bad;
+ }
+#endif
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = NULL;
+ ASSERT_VOP_LOCKED(dp, "lookup");
+ /*
+ * If we have a shared lock we may need to upgrade the lock for the
+ * last operation.
+ */
+ if (dp != vp_crossmp &&
+ VOP_ISLOCKED(dp) == LK_SHARED &&
+ (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
+ vn_lock(dp, LK_UPGRADE|LK_RETRY);
+ if ((dp->v_iflag & VI_DOOMED) != 0) {
+ error = ENOENT;
+ goto bad;
+ }
+ /*
+ * If we're looking up the last component and we need an exclusive
+ * lock, adjust our lkflags.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+#ifdef NAMEI_DIAGNOSTIC
+ vprint("lookup in", dp);
+#endif
+ lkflags_save = cnp->cn_lkflags;
+ cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
+ cnp->cn_flags);
+ if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
+ cnp->cn_lkflags = lkflags_save;
+ KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
+#ifdef NAMEI_DIAGNOSTIC
+ printf("not found\n");
+#endif
+ if ((error == ENOENT) &&
+ (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
+ (dp->v_mount->mnt_flag & MNT_UNION)) {
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, cnp->cn_flags));
+ goto unionlookup;
+ }
+
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * At this point, we know we're at the end of the
+ * pathname. If creating / renaming, we can consider
+ * allowing the file or directory to be created / renamed,
+ * provided we're not on a read-only filesystem.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* trailing slash only allowed for directories */
+ if ((cnp->cn_flags & TRAILINGSLASH) &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ goto success;
+ } else
+ cnp->cn_lkflags = lkflags_save;
+#ifdef NAMEI_DIAGNOSTIC
+ printf("found\n");
+#endif
+ /*
+ * Take into account any additional components consumed by
+ * the underlying filesystem.
+ */
+ if (cnp->cn_consume > 0) {
+ cnp->cn_nameptr += cnp->cn_consume;
+ ndp->ni_next += cnp->cn_consume;
+ ndp->ni_pathlen -= cnp->cn_consume;
+ cnp->cn_consume = 0;
+ }
+
+ dp = ndp->ni_vp;
+
+ /*
+ * Check to see if the vnode has been mounted on;
+ * if so find the root of the mounted filesystem.
+ */
+ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+ (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+ if (vfs_busy(mp, 0))
+ continue;
+ vput(dp);
+ if (dp != ndp->ni_dvp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ vref(vp_crossmp);
+ ndp->ni_dvp = vp_crossmp;
+ error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
+ cnp->cn_flags), &tdp);
+ vfs_unbusy(mp);
+ if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
+ panic("vp_crossmp exclusively locked or reclaimed");
+ if (error) {
+ dpunlocked = 1;
+ goto bad2;
+ }
+ ndp->ni_vp = dp = tdp;
+ }
+
+ /*
+ * Check for symbolic link
+ */
+ if ((dp->v_type == VLNK) &&
+ ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
+ *ndp->ni_next == '/')) {
+ cnp->cn_flags |= ISSYMLINK;
+ if (dp->v_iflag & VI_DOOMED) {
+ /*
+ * We can't know whether the directory was mounted with
+ * NOSYMFOLLOW, so we can't follow safely.
+ */
+ error = ENOENT;
+ goto bad2;
+ }
+ if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
+ error = EACCES;
+ goto bad2;
+ }
+ /*
+ * Symlink code always expects an unlocked dvp.
+ */
+ if (ndp->ni_dvp != ndp->ni_vp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+ goto success;
+ }
+
+nextname:
+ /*
+ * Not a symbolic link that we will follow. Continue with the
+ * next component if there is any; otherwise, we're done.
+ */
+ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
+ ("lookup: invalid path state."));
+ if (*ndp->ni_next == '/') {
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ /*
+ * If we're processing a path with a trailing slash,
+ * check that the end result is a directory.
+ */
+ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ if (!wantparent) {
+ ni_dvp_unlocked = 2;
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+success:
+ /*
+ * Because of lookup_shared we may have the vnode shared locked, but
+ * the caller may want it to be exclusively locked.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
+ VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
+ vn_lock(dp, LK_UPGRADE | LK_RETRY);
+ if (dp->v_iflag & VI_DOOMED) {
+ error = ENOENT;
+ goto bad2;
+ }
+ }
+ return (0);
+
+bad2:
+ if (ni_dvp_unlocked != 2) {
+ if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ }
+bad:
+ if (!dpunlocked)
+ vput(dp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * relookup - lookup a path name component
+ * Used by lookup to re-acquire things.
+ */
+int
+relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
+{
+ struct vnode *dp = 0; /* the directory we are searching */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+
+ KASSERT(cnp->cn_flags & ISLASTCN,
+ ("relookup: Not given last component."));
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+ KASSERT(wantparent, ("relookup: parent not wanted."));
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = dvp;
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+ /*
+ * Check for "" which represents the root directory after slash
+ * removal.
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ /*
+ * Support only LOOKUP for "/" because lookup()
+ * can't succeed for CREATE, DELETE and RENAME.
+ */
+ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
+ KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
+
+ if (!(cnp->cn_flags & LOCKLEAF))
+ VOP_UNLOCK(dp, 0);
+ *vpp = dp;
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ if (cnp->cn_flags & ISDOTDOT)
+ panic ("relookup: lookup on dot-dot");
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ vprint("search in:", dp);
+#endif
+ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
+ KASSERT(*vpp == NULL, ("leaf should be empty"));
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ return (0);
+ }
+
+ dp = *vpp;
+
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ if (dvp == dp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ error = EROFS;
+ goto bad;
+ }
+ /*
+ * Set the parent lock/ref state to the requested state.
+ */
+ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
+ if (wantparent)
+ VOP_UNLOCK(dvp, 0);
+ else
+ vput(dvp);
+ } else if (!wantparent)
+ vrele(dvp);
+ /*
+ * Check for symbolic link
+ */
+ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
+ ("relookup: symlink found.\n"));
+
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+ return (0);
+bad:
+ vput(dp);
+ *vpp = NULL;
+ return (error);
+}
+
+void
+NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
+ const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
+ struct thread *td)
+{
+
+ ndp->ni_cnd.cn_nameiop = op;
+ ndp->ni_cnd.cn_flags = flags;
+ ndp->ni_segflg = segflg;
+ ndp->ni_dirp = namep;
+ ndp->ni_dirfd = dirfd;
+ ndp->ni_startdir = startdir;
+ ndp->ni_strictrelative = 0;
+ if (rightsp != NULL)
+ ndp->ni_rightsneeded = *rightsp;
+ else
+ cap_rights_init(&ndp->ni_rightsneeded);
+ filecaps_init(&ndp->ni_filecaps);
+ ndp->ni_cnd.cn_thread = td;
+}
+
+/*
+ * Free data allocated by namei(); see namei(9) for details.
+ */
+void
+NDFREE(struct nameidata *ndp, const u_int flags)
+{
+ int unlock_dvp;
+ int unlock_vp;
+
+ unlock_dvp = 0;
+ unlock_vp = 0;
+
+ if (!(flags & NDF_NO_FREE_PNBUF) &&
+ (ndp->ni_cnd.cn_flags & HASBUF)) {
+ uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ ndp->ni_cnd.cn_flags &= ~HASBUF;
+ }
+ if (!(flags & NDF_NO_VP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
+ unlock_vp = 1;
+ if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
+ if (unlock_vp) {
+ vput(ndp->ni_vp);
+ unlock_vp = 0;
+ } else
+ vrele(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ }
+ if (unlock_vp)
+ VOP_UNLOCK(ndp->ni_vp, 0);
+ if (!(flags & NDF_NO_DVP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
+ ndp->ni_dvp != ndp->ni_vp)
+ unlock_dvp = 1;
+ if (!(flags & NDF_NO_DVP_RELE) &&
+ (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
+ if (unlock_dvp) {
+ vput(ndp->ni_dvp);
+ unlock_dvp = 0;
+ } else
+ vrele(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ }
+ if (unlock_dvp)
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ if (!(flags & NDF_NO_STARTDIR_RELE) &&
+ (ndp->ni_cnd.cn_flags & SAVESTART)) {
+ vrele(ndp->ni_startdir);
+ ndp->ni_startdir = NULL;
+ }
+}
+
+/*
+ * Determine if there is a suitable alternate filename under the specified
+ * prefix for the specified path. If the create flag is set, then the
+ * alternate prefix will be used so long as the parent directory exists.
+ * This is used by the various compatibility ABIs so that, for example, Linux
+ * binaries prefer files under /compat/linux. The chosen path (whether under
+ * the prefix or under /) is returned in a kernel malloc'd buffer pointed
+ * to by pathbuf. The caller is responsible for freeing the buffer from
+ * the M_TEMP bucket if one is returned.
+ */
+int
+kern_alternate_path(struct thread *td, const char *prefix, const char *path,
+ enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
+{
+ struct nameidata nd, ndroot;
+ char *ptr, *buf, *cp;
+ size_t len, sz;
+ int error;
+
+ buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ *pathbuf = buf;
+
+ /* Copy the prefix into the new pathname as a starting point. */
+ len = strlcpy(buf, prefix, MAXPATHLEN);
+ if (len >= MAXPATHLEN) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ sz = MAXPATHLEN - len;
+ ptr = buf + len;
+
+ /* Append the filename to the prefix. */
+ if (pathseg == UIO_SYSSPACE)
+ error = copystr(path, ptr, sz, &len);
+ else
+ error = copyinstr(path, ptr, sz, &len);
+
+ if (error) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (error);
+ }
+
+ /* Only use a prefix with absolute pathnames. */
+ if (*ptr != '/') {
+ error = EINVAL;
+ goto keeporig;
+ }
+
+ if (dirfd != AT_FDCWD) {
+ /*
+ * We want the original because the "prefix" is
+ * included in the already opened dirfd.
+ */
+ bcopy(ptr, buf, len);
+ return (0);
+ }
+
+ /*
+ * We know that there is a '/' somewhere in this pathname.
+ * Search backwards for it, to find the file's parent directory
+ * and check whether it exists in the alternate tree. If it does
+ * and we want to create a file (the create flag is set), the
+ * prefixed path is used. We don't need to worry about the root
+ * comparison in this case.
+ */
+
+ if (create) {
+ for (cp = &ptr[len] - 1; *cp != '/'; cp--);
+ *cp = '\0';
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
+ error = namei(&nd);
+ *cp = '/';
+ if (error != 0)
+ goto keeporig;
+ } else {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
+
+ error = namei(&nd);
+ if (error != 0)
+ goto keeporig;
+
+ /*
+ * We now compare the vnode of the prefix to the vnode that was
+ * asked for. If they resolve to be the same, then we
+ * ignore the match so that the real root gets used.
+ * This avoids the problem of traversing "../.." to find the
+ * root directory and never finding it, because "/" resolves
+ * to the emulation root directory. This is expensive :-(
+ */
+ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
+ td);
+
+ /* We shouldn't ever get an error from this namei(). */
+ error = namei(&ndroot);
+ if (error == 0) {
+ if (nd.ni_vp == ndroot.ni_vp)
+ error = ENOENT;
+
+ NDFREE(&ndroot, NDF_ONLY_PNBUF);
+ vrele(ndroot.ni_vp);
+ }
+ }
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+
+keeporig:
+ /* If there was an error, use the original path name. */
+ if (error)
+ bcopy(ptr, buf, len);
+ return (error);
+}
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 0000000..8f92e10
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,1949 @@
+/*-
+ * Copyright (c) 1999-2004 Poul-Henning Kamp
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/reboot.h>
+#include <sys/sbuf.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <vm/uma.h>
+
+#include <geom/geom.h>
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#define VFS_MOUNTARG_SIZE_MAX (1024 * 64)
+
+static int vfs_domount(struct thread *td, const char *fstype, char *fspath,
+ uint64_t fsflags, struct vfsoptlist **optlist);
+static void free_mntarg(struct mntarg *ma);
+
+static int usermount = 0;
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
+ "Unprivileged users may mount and unmount file systems");
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
+static uma_zone_t mount_zone;
+
+/* List of mounted filesystems. */
+struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
+
+/* For any iteration/modification of mountlist */
+struct mtx mountlist_mtx;
+MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
+
+/*
+ * Global opts, taken by all filesystems
+ */
+static const char *global_opts[] = {
+ "errmsg",
+ "fstype",
+ "fspath",
+ "ro",
+ "rw",
+ "nosuid",
+ "noexec",
+ NULL
+};
+
+static int
+mount_init(void *mem, int size, int flags)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+ lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+ return (0);
+}
+
+static void
+mount_fini(void *mem, int size)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ lockdestroy(&mp->mnt_explock);
+ mtx_destroy(&mp->mnt_mtx);
+}
+
+static void
+vfs_mount_init(void *dummy __unused)
+{
+
+ mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
+ NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for building and sanitizing the mount options
+ */
+
+/* Remove one mount option. */
+static void
+vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
+{
+
+ TAILQ_REMOVE(opts, opt, link);
+ free(opt->name, M_MOUNT);
+ if (opt->value != NULL)
+ free(opt->value, M_MOUNT);
+ free(opt, M_MOUNT);
+}
+
+/* Release all resources related to the mount options. */
+void
+vfs_freeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt;
+
+ while (!TAILQ_EMPTY(opts)) {
+ opt = TAILQ_FIRST(opts);
+ vfs_freeopt(opts, opt);
+ }
+ free(opts, M_MOUNT);
+}
+
+void
+vfs_deleteopt(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt, *temp;
+
+ if (opts == NULL)
+ return;
+ TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
+ if (strcmp(opt->name, name) == 0)
+ vfs_freeopt(opts, opt);
+ }
+}
+
+static int
+vfs_isopt_ro(const char *opt)
+{
+
+ if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
+ strcmp(opt, "norw") == 0)
+ return (1);
+ return (0);
+}
+
+static int
+vfs_isopt_rw(const char *opt)
+{
+
+ if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * Check if options are equal (with or without the "no" prefix).
+ */
+static int
+vfs_equalopts(const char *opt1, const char *opt2)
+{
+ char *p;
+
+ /* "opt" vs. "opt" or "noopt" vs. "noopt" */
+ if (strcmp(opt1, opt2) == 0)
+ return (1);
+ /* "noopt" vs. "opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "opt" vs. "noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ while ((p = strchr(opt1, '.')) != NULL &&
+ !strncmp(opt1, opt2, ++p - opt1)) {
+ opt2 += p - opt1;
+ opt1 = p;
+ /* "foo.noopt" vs. "foo.opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "foo.opt" vs. "foo.noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ }
+ /* "ro" / "rdonly" / "norw" / "rw" / "noro" */
+ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
+ (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
+ return (1);
+ return (0);
+}
+
+/*
+ * If a mount option is specified several times
+ * (with or without the "no" prefix), only keep
+ * the last occurrence of it.
+ */
+static void
+vfs_sanitizeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt, *opt2, *tmp;
+
+ TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
+ opt2 = TAILQ_PREV(opt, vfsoptlist, link);
+ while (opt2 != NULL) {
+ if (vfs_equalopts(opt->name, opt2->name)) {
+ tmp = TAILQ_PREV(opt2, vfsoptlist, link);
+ vfs_freeopt(opts, opt2);
+ opt2 = tmp;
+ } else {
+ opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
+ }
+ }
+ }
+}
+
+/*
+ * Build a linked list of mount options from a struct uio.
+ */
+int
+vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
+{
+ struct vfsoptlist *opts;
+ struct vfsopt *opt;
+ size_t memused, namelen, optlen;
+ unsigned int i, iovcnt;
+ int error;
+
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ memused = 0;
+ iovcnt = auio->uio_iovcnt;
+ for (i = 0; i < iovcnt; i += 2) {
+ namelen = auio->uio_iov[i].iov_len;
+ optlen = auio->uio_iov[i + 1].iov_len;
+ memused += sizeof(struct vfsopt) + optlen + namelen;
+ /*
+ * Avoid consuming too much memory, and guard against attempts
+ * to overflow memused.
+ */
+ if (memused > VFS_MOUNTARG_SIZE_MAX ||
+ optlen > VFS_MOUNTARG_SIZE_MAX ||
+ namelen > VFS_MOUNTARG_SIZE_MAX) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
+ opt->value = NULL;
+ opt->len = 0;
+ opt->pos = i / 2;
+ opt->seen = 0;
+
+ /*
+ * Do this early, so jumps to "bad" will free the current
+ * option.
+ */
+ TAILQ_INSERT_TAIL(opts, opt, link);
+
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
+ } else {
+ error = copyin(auio->uio_iov[i].iov_base, opt->name,
+ namelen);
+ if (error)
+ goto bad;
+ }
+ /* Ensure names are null-terminated strings. */
+ if (namelen == 0 || opt->name[namelen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+ if (optlen != 0) {
+ opt->len = optlen;
+ opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
+ optlen);
+ } else {
+ error = copyin(auio->uio_iov[i + 1].iov_base,
+ opt->value, optlen);
+ if (error)
+ goto bad;
+ }
+ }
+ }
+ vfs_sanitizeopts(opts);
+ *options = opts;
+ return (0);
+bad:
+ vfs_freeopts(opts);
+ return (error);
+}
+
+/*
+ * Merge the old mount options with the new ones passed
+ * in the MNT_UPDATE case.
+ *
+ * XXX: This function will keep a "nofoo" option in the new
+ * options. E.g., if the option's canonical name is "foo",
+ * "nofoo" ends up in the mount point's active options.
+ */
+static void
+vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
+{
+ struct vfsopt *opt, *new;
+
+ TAILQ_FOREACH(opt, oldopts, link) {
+ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ new->name = strdup(opt->name, M_MOUNT);
+ if (opt->len != 0) {
+ new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(opt->value, new->value, opt->len);
+ } else
+ new->value = NULL;
+ new->len = opt->len;
+ new->seen = opt->seen;
+ TAILQ_INSERT_HEAD(toopts, new, link);
+ }
+ vfs_sanitizeopts(toopts);
+}
+
+/*
+ * Mount a filesystem.
+ */
+int
+sys_nmount(td, uap)
+ struct thread *td;
+ struct nmount_args /* {
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+ } */ *uap;
+{
+ struct uio *auio;
+ int error;
+ u_int iovcnt;
+ uint64_t flags;
+
+ /*
+ * Mount flags are now 64 bits. On 32-bit architectures only
+ * 32 bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+ CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
+ uap->iovp, uap->iovcnt, flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of nmount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ iovcnt = uap->iovcnt;
+ /*
+ * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4)) {
+ CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
+ uap->iovcnt);
+ return (EINVAL);
+ }
+
+ error = copyinuio(uap->iovp, iovcnt, &auio);
+ if (error) {
+ CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
+ __func__, error);
+ return (error);
+ }
+ error = vfs_donmount(td, flags, auio);
+
+ free(auio, M_IOV);
+ return (error);
+}
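+
+/*
+ * The iovec layout enforced above is a sequence of name/value pairs, and the
+ * option names (plus string values such as fstype and fspath) must include
+ * their terminating NUL in the length.  A minimal userland sketch that
+ * satisfies it (illustrative only, mounting a hypothetical nullfs instance)
+ * would look like:
+ *
+ * struct iovec iov[] = {
+ * { "fstype", sizeof("fstype") }, { "nullfs", sizeof("nullfs") },
+ * { "fspath", sizeof("fspath") }, { "/mnt", sizeof("/mnt") },
+ * { "target", sizeof("target") }, { "/tmp", sizeof("/tmp") },
+ * };
+ * if (nmount(iov, nitems(iov), MNT_RDONLY) == -1)
+ * err(1, "nmount");
+ */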
+
+/*
+ * ---------------------------------------------------------------------
+ * Various utility functions
+ */
+
+void
+vfs_ref(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+}
+
+void
+vfs_rel(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Allocate and initialize the mount point struct.
+ */
+struct mount *
+vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
+ struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = uma_zalloc(mount_zone, M_WAITOK);
+ bzero(&mp->mnt_startzero,
+ __rangeof(struct mount, mnt_startzero, mnt_endzero));
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ mp->mnt_nvnodelistsize = 0;
+ TAILQ_INIT(&mp->mnt_activevnodelist);
+ mp->mnt_activevnodelistsize = 0;
+ mp->mnt_ref = 0;
+ (void) vfs_busy(mp, MBF_NOWAIT);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++; /* XXX Unlocked */
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_gen++;
+ strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_cred = crdup(cred);
+ mp->mnt_stat.f_owner = cred->cr_uid;
+ strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+#ifdef MAC
+ mac_mount_init(mp);
+ mac_mount_create(cred, mp);
+#endif
+ arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+ TAILQ_INIT(&mp->mnt_uppers);
+ return (mp);
+}
+
+/*
+ * Destroy the mount struct previously allocated by vfs_mount_alloc().
+ */
+void
+vfs_mount_destroy(struct mount *mp)
+{
+
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_REFEXPIRE;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ while (mp->mnt_ref)
+ msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
+ KASSERT(mp->mnt_ref == 0,
+ ("%s: invalid refcount in the drain path @ %s:%d", __func__,
+ __FILE__, __LINE__));
+ if (mp->mnt_writeopcount != 0)
+ panic("vfs_mount_destroy: nonzero writeopcount");
+ if (mp->mnt_secondary_writes != 0)
+ panic("vfs_mount_destroy: nonzero secondary_writes");
+ mp->mnt_vfc->vfc_refcount--;
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
+ struct vnode *vp;
+
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
+ vprint("", vp);
+ panic("unmount: dangling vnode");
+ }
+ KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
+ if (mp->mnt_nvnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero nvnodelistsize");
+ if (mp->mnt_activevnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero activevnodelistsize");
+ if (mp->mnt_lockref != 0)
+ panic("vfs_mount_destroy: nonzero lock refcount");
+ MNT_IUNLOCK(mp);
+#ifdef MAC
+ mac_mount_destroy(mp);
+#endif
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ crfree(mp->mnt_cred);
+ uma_zfree(mount_zone, mp);
+}
+
+int
+vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
+{
+ struct vfsoptlist *optlist;
+ struct vfsopt *opt, *tmp_opt;
+ char *fstype, *fspath, *errmsg;
+ int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+
+ errmsg = fspath = NULL;
+ errmsg_len = fspathlen = 0;
+ errmsg_pos = -1;
+
+ error = vfs_buildopts(fsoptions, &optlist);
+ if (error)
+ return (error);
+
+ if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
+ errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
+
+ /*
+ * We need these two options before the others,
+ * and they are mandatory for any filesystem.
+ * Ensure they are NUL terminated as well.
+ */
+ fstypelen = 0;
+ error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
+ if (error || fstype[fstypelen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fstype", errmsg_len);
+ goto bail;
+ }
+ fspathlen = 0;
+ error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
+ if (error || fspath[fspathlen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fspath", errmsg_len);
+ goto bail;
+ }
+
+ /*
+ * We need to see if we have the "update" option
+ * before we call vfs_domount(), since vfs_domount() has special
+ * logic based on MNT_UPDATE. This is very important
+ * when we want to update the root filesystem.
+ */
+ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
+ if (strcmp(opt->name, "update") == 0) {
+ fsflags |= MNT_UPDATE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "async") == 0)
+ fsflags |= MNT_ASYNC;
+ else if (strcmp(opt->name, "force") == 0) {
+ fsflags |= MNT_FORCE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "reload") == 0) {
+ fsflags |= MNT_RELOAD;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "multilabel") == 0)
+ fsflags |= MNT_MULTILABEL;
+ else if (strcmp(opt->name, "noasync") == 0)
+ fsflags &= ~MNT_ASYNC;
+ else if (strcmp(opt->name, "noatime") == 0)
+ fsflags |= MNT_NOATIME;
+ else if (strcmp(opt->name, "atime") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoatime", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterr") == 0)
+ fsflags |= MNT_NOCLUSTERR;
+ else if (strcmp(opt->name, "clusterr") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterr", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterw") == 0)
+ fsflags |= MNT_NOCLUSTERW;
+ else if (strcmp(opt->name, "clusterw") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterw", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noexec") == 0)
+ fsflags |= MNT_NOEXEC;
+ else if (strcmp(opt->name, "exec") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoexec", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosuid") == 0)
+ fsflags |= MNT_NOSUID;
+ else if (strcmp(opt->name, "suid") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosuid", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosymfollow") == 0)
+ fsflags |= MNT_NOSYMFOLLOW;
+ else if (strcmp(opt->name, "symfollow") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosymfollow", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noro") == 0)
+ fsflags &= ~MNT_RDONLY;
+ else if (strcmp(opt->name, "rw") == 0)
+ fsflags &= ~MNT_RDONLY;
+ else if (strcmp(opt->name, "ro") == 0)
+ fsflags |= MNT_RDONLY;
+ else if (strcmp(opt->name, "rdonly") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("ro", M_MOUNT);
+ fsflags |= MNT_RDONLY;
+ }
+ else if (strcmp(opt->name, "suiddir") == 0)
+ fsflags |= MNT_SUIDDIR;
+ else if (strcmp(opt->name, "sync") == 0)
+ fsflags |= MNT_SYNCHRONOUS;
+ else if (strcmp(opt->name, "union") == 0)
+ fsflags |= MNT_UNION;
+ }
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+ error = ENAMETOOLONG;
+ goto bail;
+ }
+
+ error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+bail:
+ /* copyout the errmsg */
+ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
+ && errmsg_len > 0 && errmsg != NULL) {
+ if (fsoptions->uio_segflg == UIO_SYSSPACE) {
+ bcopy(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ } else {
+ copyout(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ }
+ }
+
+ if (optlist != NULL)
+ vfs_freeopts(optlist);
+ return (error);
+}
+
+/*
+ * Old mount API.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+sys_mount(struct thread *td, struct mount_args *uap)
+{
+ char *fstype;
+ struct vfsconf *vfsp = NULL;
+ struct mntarg *ma = NULL;
+ uint64_t flags;
+ int error;
+
+ /*
+ * Mount flags are now 64-bits. On 32-bit architectures only
+ * 32-bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of mount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
+ if (error) {
+ free(fstype, M_TEMP);
+ return (error);
+ }
+
+ AUDIT_ARG_TEXT(fstype);
+ mtx_lock(&Giant);
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ free(fstype, M_TEMP);
+ if (vfsp == NULL) {
+ mtx_unlock(&Giant);
+ return (ENOENT);
+ }
+ if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
+ mtx_unlock(&Giant);
+ return (EOPNOTSUPP);
+ }
+
+ ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
+ ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
+ ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
+ ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
+ ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
+
+ error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * vfs_domount_first(): first file system mount (not update)
+ */
+static int
+vfs_domount_first(
+ struct thread *td, /* Calling thread. */
+ struct vfsconf *vfsp, /* File system type. */
+ char *fspath, /* Mount path. */
+ struct vnode *vp, /* Vnode to be covered. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vattr va;
+ struct mount *mp;
+ struct vnode *newdp;
+ int error;
+
+ mtx_assert(&Giant, MA_OWNED);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
+
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
+ error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
+ if (error == 0)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+
+ /* Allocate and initialize the filesystem. */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
+ /* XXXMAC: pass to vfs_mount_alloc? */
+ mp->mnt_optnew = *optlist;
+ /* Set the mount level flags. */
+ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ vfs_mount_destroy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error);
+ }
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+
+ /*
+ * Prevent external consumers of mount options from reading mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ cache_purge(vp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vp->v_mountedhere = mp;
+ /* Place the new filesystem at the end of the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
+ panic("mount: lost mount");
+ VOP_UNLOCK(vp, 0);
+ EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td);
+ VOP_UNLOCK(newdp, 0);
+ mountcheckdirs(vp, newdp);
+ vrele(newdp);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp);
+ return (0);
+}
+
+/*
+ * vfs_domount_update(): update of mounted file system
+ */
+static int
+vfs_domount_update(
+ struct thread *td, /* Calling thread. */
+ struct vnode *vp, /* Mount point vnode. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct oexport_args oexport;
+ struct export_args export;
+ struct mount *mp;
+ int error, export_error;
+ uint64_t flag;
+
+ mtx_assert(&Giant, MA_OWNED);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
+
+ if ((vp->v_vflag & VV_ROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ flag = mp->mnt_flag;
+ if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that
+ * did the original mount is permitted to update it.
+ */
+ error = vfs_suser(mp, td);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ if (vfs_busy(mp, MBF_NOWAIT)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
+ VI_UNLOCK(vp);
+ vfs_unbusy(mp);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_iflag |= VI_MOUNT;
+ VI_UNLOCK(vp);
+ VOP_UNLOCK(vp, 0);
+
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
+ MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
+ if ((mp->mnt_flag & MNT_ASYNC) == 0)
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ mp->mnt_optnew = *optlist;
+ vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp);
+
+ export_error = 0;
+ if (error == 0) {
+ /* Process the export option. */
+ if (vfs_copyopt(mp->mnt_optnew, "export", &export,
+ sizeof(export)) == 0) {
+ export_error = vfs_export(mp, &export);
+ } else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
+ sizeof(oexport)) == 0) {
+ export.ex_flags = oexport.ex_flags;
+ export.ex_root = oexport.ex_root;
+ export.ex_anon = oexport.ex_anon;
+ export.ex_addr = oexport.ex_addr;
+ export.ex_addrlen = oexport.ex_addrlen;
+ export.ex_mask = oexport.ex_mask;
+ export.ex_masklen = oexport.ex_masklen;
+ export.ex_indexfile = oexport.ex_indexfile;
+ export.ex_numsecflavors = 0;
+ export_error = vfs_export(mp, &export);
+ }
+ }
+
+ MNT_ILOCK(mp);
+ if (error == 0) {
+ mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
+ MNT_SNAPSHOT);
+ } else {
+ /*
+ * If we fail, restore old mount flags. MNT_QUOTA is special,
+ * because it is not part of MNT_UPDATEMASK, but it could have
+ * changed in the meantime if quotactl(2) was called.
+ * All in all we want current value of MNT_QUOTA, not the old
+ * one.
+ */
+ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
+ }
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ if (error != 0)
+ goto end;
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ else
+ vfs_deallocate_syncvnode(mp);
+end:
+ vfs_unbusy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error != 0 ? error : export_error);
+}
+
+/*
+ * vfs_domount(): actually attempt a filesystem mount.
+ */
+static int
+vfs_domount(
+ struct thread *td, /* Calling thread. */
+ const char *fstype, /* Filesystem type. */
+ char *fspath, /* Mount path. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vfsconf *vfsp;
+ struct nameidata nd;
+ struct vnode *vp;
+ char *pathbuf;
+ int error;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ return (ENAMETOOLONG);
+
+ if (jailed(td->td_ucred) || usermount == 0) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
+ return (error);
+ }
+
+ /*
+ * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
+ if (error)
+ return (error);
+ }
+ if (fsflags & MNT_SUIDDIR) {
+ error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
+ if (error)
+ return (error);
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
+ */
+ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
+ if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
+ fsflags |= MNT_NOSUID | MNT_USER;
+ }
+
+ /* Load KLDs before we lock the covered vnode to avoid reversals. */
+ vfsp = NULL;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ /* Don't try to load KLDs if we're mounting the root. */
+ if (fsflags & MNT_ROOTFS)
+ vfsp = vfs_byname(fstype);
+ else
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ if (vfsp == NULL)
+ return (ENODEV);
+ if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
+ return (EPERM);
+ }
+
+ /*
+ * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, fspath, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ mtx_lock(&Giant);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ strcpy(pathbuf, fspath);
+ error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN);
+ /* debug.disablefullpath == 1 results in ENODEV */
+ if (error == 0 || error == ENODEV) {
+ error = vfs_domount_first(td, vfsp, pathbuf, vp,
+ fsflags, optlist);
+ }
+ free(pathbuf, M_TEMP);
+ } else
+ error = vfs_domount_update(td, vp, fsflags, optlist);
+ mtx_unlock(&Giant);
+
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+
+ return (error);
+}
+
+/*
+ * Unmount a filesystem.
+ *
+ * Note: unmount takes a path to the vnode mounted on as argument, not
+ * the special device file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+sys_unmount(struct thread *td, struct unmount_args *uap)
+{
+ struct nameidata nd;
+ struct mount *mp;
+ char *pathbuf;
+ int error, id0, id1;
+
+ AUDIT_ARG_VALUE(uap->flags);
+ if (jailed(td->td_ucred) || usermount == 0) {
+ error = priv_check(td, PRIV_VFS_UNMOUNT);
+ if (error)
+ return (error);
+ }
+
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
+ if (error) {
+ free(pathbuf, M_TEMP);
+ return (error);
+ }
+ mtx_lock(&Giant);
+ if (uap->flags & MNT_BYFSID) {
+ AUDIT_ARG_TEXT(pathbuf);
+ /* Decode the filesystem ID. */
+ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
+ mtx_unlock(&Giant);
+ free(pathbuf, M_TEMP);
+ return (EINVAL);
+ }
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == id0 &&
+ mp->mnt_stat.f_fsid.val[1] == id1)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+ } else {
+ /*
+ * Try to find global path for path argument.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, pathbuf, td);
+ if (namei(&nd) == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
+ MNAMELEN);
+ if (error == 0 || error == ENODEV)
+ vput(nd.ni_vp);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+ }
+ free(pathbuf, M_TEMP);
+ if (mp == NULL) {
+ /*
+ * Previously we returned ENOENT for a nonexistent path and
+ * EINVAL for a non-mountpoint. We cannot tell these apart
+ * now, so in the !MNT_BYFSID case return the more likely
+ * EINVAL for compatibility.
+ */
+ mtx_unlock(&Giant);
+ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
+ }
+
+ /*
+ * Don't allow unmounting the root filesystem.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ mtx_unlock(&Giant);
+ return (EINVAL);
+ }
+ error = dounmount(mp, uap->flags, td);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Do the actual filesystem unmount.
+ */
+int
+dounmount(struct mount *mp, int flags, struct thread *td)
+{
+ struct vnode *coveredvp, *fsrootvp;
+ int error;
+ uint64_t async_flag;
+ int mnt_gen_r;
+
+ mtx_assert(&Giant, MA_OWNED);
+
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
+ mnt_gen_r = mp->mnt_gen;
+ VI_LOCK(coveredvp);
+ vholdl(coveredvp);
+ vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
+ vdrop(coveredvp);
+ /*
+ * Check for mp being unmounted while waiting for the
+ * covered vnode lock.
+ */
+ if (coveredvp->v_mountedhere != mp ||
+ coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
+ VOP_UNLOCK(coveredvp, 0);
+ return (EBUSY);
+ }
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that did the
+ * original mount is permitted to unmount this filesystem.
+ */
+ error = vfs_suser(mp, td);
+ if (error) {
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (error);
+ }
+
+ vn_start_write(NULL, &mp, V_WAIT);
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+ !TAILQ_EMPTY(&mp->mnt_uppers)) {
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ vn_finished_write(mp);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
+ /* Allow filesystems to detect that a forced unmount is in progress. */
+ if (flags & MNT_FORCE) {
+ mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(mp);
+ /*
+ * Must be done after setting MNTK_UNMOUNTF and before
+ * waiting for mnt_lockref to become 0.
+ */
+ VFS_PURGE(mp);
+ MNT_ILOCK(mp);
+ }
+ error = 0;
+ if (mp->mnt_lockref) {
+ mp->mnt_kern_flag |= MNTK_DRAINING;
+ error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
+ "mount drain", 0);
+ }
+ MNT_IUNLOCK(mp);
+ KASSERT(mp->mnt_lockref == 0,
+ ("%s: invalid lock refcount in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+ KASSERT(error == 0,
+ ("%s: invalid return value for msleep in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ MNT_ILOCK(mp);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ cache_purgevfs(mp); /* remove cache entries for this file system */
+ vfs_deallocate_syncvnode(mp);
+ /*
+ * For forced unmounts, move process cdir/rdir refs on the fs root
+ * vnode to the covered vnode. For non-forced unmounts we want
+ * such references to cause an EBUSY error.
+ */
+ if ((flags & MNT_FORCE) &&
+ VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
+ if (fsrootvp == rootvnode) {
+ vrele(rootvnode);
+ rootvnode = NULL;
+ }
+ vput(fsrootvp);
+ }
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
+ error = VFS_UNMOUNT(mp, flags);
+ vn_finished_write(mp);
+ /*
+ * If we failed to flush the dirty blocks for this mount point, undo
+ * all the cdir/rdir and rootvnode changes we made above, unless the
+ * failure occurred because the device reports that it no longer
+ * exists.
+ */
+ if (error && error != ENXIO) {
+ if ((flags & MNT_FORCE) &&
+ VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
+ if (rootvnode == NULL) {
+ rootvnode = fsrootvp;
+ vref(rootvnode);
+ }
+ vput(fsrootvp);
+ }
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ MNT_IUNLOCK(mp);
+ vfs_allocate_syncvnode(mp);
+ MNT_ILOCK(mp);
+ }
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_flag |= async_flag;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (error);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ EVENTHANDLER_INVOKE(vfs_unmounted, mp, td);
+ if (coveredvp != NULL) {
+ coveredvp->v_mountedhere = NULL;
+ vput(coveredvp);
+ }
+ vfs_event_signal(NULL, VQ_UNMOUNT, 0);
+ vfs_mount_destroy(mp);
+ return (0);
+}
+
+/*
+ * Report errors during filesystem mounting.
+ */
+void
+vfs_mount_error(struct mount *mp, const char *fmt, ...)
+{
+ struct vfsoptlist *moptlist = mp->mnt_optnew;
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+void
+vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
+{
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for querying mount options/arguments from filesystems.
+ */
+
+/*
+ * Check that no unknown options are given
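+ *
+ * An option is accepted if its name, or the name with a leading "no"
+ * stripped, appears either in the global option list or in the
+ * filesystem-supplied "legal" list; otherwise EINVAL is returned and a
+ * message is placed in the "errmsg" option, if one is present.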
+ */
+int
+vfs_filteropt(struct vfsoptlist *opts, const char **legal)
+{
+ struct vfsopt *opt;
+ char errmsg[255];
+ const char **t, *p, *q;
+ int ret = 0;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ p = opt->name;
+ q = NULL;
+ if (p[0] == 'n' && p[1] == 'o')
+ q = p + 2;
+ for (t = global_opts; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ for (t = legal; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ snprintf(errmsg, sizeof(errmsg),
+ "mount option <%s> is unknown", p);
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(opt->name, "errmsg") == 0) {
+ strncpy((char *)opt->value, errmsg, opt->len);
+ break;
+ }
+ }
+ if (opt == NULL)
+ printf("%s\n", errmsg);
+ }
+ return (ret);
+}
+
+/*
+ * Get a mount option by its name.
+ *
+ * Return 0 if the option was found, ENOENT otherwise.
+ * If len is non-NULL it is filled with the length of the option's value.
+ * If buf is non-NULL it is filled with the address of the option's value.
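+ *
+ * A minimal usage sketch (the option name "from" is only an example):
+ *
+ *	char *from;
+ *	int len;
+ *
+ *	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, &len) == 0 &&
+ *	    len > 0 && from[len - 1] == '\0')
+ *		printf("mounted from %s\n", from);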
+ */
+int
+vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != NULL)
+ *len = opt->len;
+ if (buf != NULL)
+ *buf = opt->value;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt;
+
+ if (opts == NULL)
+ return (-1);
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ return (opt->pos);
+ }
+ }
+ return (-1);
+}
+
+int
+vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
+{
+ char *opt_value, *vtp;
+ quad_t iv;
+ int error, opt_len;
+
+ error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
+ if (error != 0)
+ return (error);
+ if (opt_len == 0 || opt_value == NULL)
+ return (EINVAL);
+ if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
+ return (EINVAL);
+ iv = strtoq(opt_value, &vtp, 0);
+ if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
+ return (EINVAL);
+ if (iv < 0)
+ return (EINVAL);
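+ /*
+ * The cases below intentionally fall through: each recognized
+ * suffix multiplies by a further factor of 1024, so e.g. "16m"
+ * yields 16 * 1024 * 1024.
+ */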
+ switch (vtp[0]) {
+ case 't':
+ case 'T':
+ iv *= 1024;
+ case 'g':
+ case 'G':
+ iv *= 1024;
+ case 'm':
+ case 'M':
+ iv *= 1024;
+ case 'k':
+ case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ return (EINVAL);
+ }
+ *value = iv;
+
+ return (0);
+}
+
+char *
+vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
+{
+ struct vfsopt *opt;
+
+ *error = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 ||
+ ((char *)opt->value)[opt->len - 1] != '\0') {
+ *error = EINVAL;
+ return (NULL);
+ }
+ return (opt->value);
+ }
+ *error = ENOENT;
+ return (NULL);
+}
+
+int
+vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
+ uint64_t val)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (w != NULL)
+ *w |= val;
+ return (1);
+ }
+ }
+ if (w != NULL)
+ *w &= ~val;
+ return (0);
+}
+
+int
+vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct vfsopt *opt;
+ int ret;
+
+ KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 || opt->value == NULL)
+ return (0);
+ if (((char *)opt->value)[opt->len - 1] != '\0')
+ return (0);
+ va_start(ap, fmt);
+ ret = vsscanf(opt->value, fmt, ap);
+ va_end(ap);
+ return (ret);
+ }
+ return (0);
+}
+
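+/*
+ * Overwrite the value of an existing option. When the option already
+ * carries a value buffer, vfs_setopt() requires the new length to match the
+ * stored length exactly, vfs_setopt_part() accepts a shorter value and
+ * shrinks the stored length, and vfs_setopts() copies a NUL-terminated
+ * string that must fit in the existing buffer. All three return ENOENT
+ * when the option is not present.
+ */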
+int
+vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len != len)
+ return (EINVAL);
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len < len)
+ return (EINVAL);
+ opt->len = len;
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = strlen(value) + 1;
+ else if (strlcpy(opt->value, value, opt->len) >= opt->len)
+ return (EINVAL);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+/*
+ * Find and copy a mount option.
+ *
+ * The size of the destination buffer must be specified in len; if it
+ * does not match the length of the mount option exactly, EINVAL is
+ * returned.
+ * Returns ENOENT if the option is not found.
+ */
+int
+vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != opt->len)
+ return (EINVAL);
+ bcopy(opt->value, dest, opt->len);
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+__vfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ int error;
+
+ error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
+ if (sbp != &mp->mnt_stat)
+ *sbp = mp->mnt_stat;
+ return (error);
+}
+
+void
+vfs_mountedfrom(struct mount *mp, const char *from)
+{
+
+ bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
+ strlcpy(mp->mnt_stat.f_mntfromname, from,
+ sizeof mp->mnt_stat.f_mntfromname);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * This is the API for building mount arguments and mounting filesystems
+ * from inside the kernel.
+ *
+ * The API works by accumulating individual arguments; the first error
+ * encountered is latched and eventually returned by kernel_mount().
+ *
+ * XXX: should be documented in new manpage kernel_mount(9)
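+ *
+ * A minimal sketch; the filesystem type, device and mount point below are
+ * illustrative only, not taken from this file:
+ *
+ *	struct mntarg *ma;
+ *	int error;
+ *
+ *	ma = mount_arg(NULL, "fstype", "ufs", -1);
+ *	ma = mount_arg(ma, "fspath", "/mnt", -1);
+ *	ma = mount_arg(ma, "from", "/dev/ada0p2", -1);
+ *	ma = mount_argb(ma, 1, "noro");
+ *	error = kernel_mount(ma, MNT_RDONLY);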
+ */
+
+/* A memory allocation which must be freed when we are done */
+struct mntaarg {
+ SLIST_ENTRY(mntaarg) next;
+};
+
+/* The header for the mount arguments */
+struct mntarg {
+ struct iovec *v;
+ int len;
+ int error;
+ SLIST_HEAD(, mntaarg) list;
+};
+
+/*
+ * Add a boolean argument.
+ *
+ * flag is the boolean value.
+ * name must start with "no".
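+ *
+ * For example, mount_argb(ma, 1, "noro") records the positive option "ro",
+ * while mount_argb(ma, 0, "noro") records "noro".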
+ */
+struct mntarg *
+mount_argb(struct mntarg *ma, int flag, const char *name)
+{
+
+ KASSERT(name[0] == 'n' && name[1] == 'o',
+ ("mount_argb(...,%s): name must start with 'no'", name));
+
+ return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
+}
+
+/*
+ * Add an argument whose value is formatted printf-style.
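+ *
+ * For example (illustrative), mount_argf(ma, "from", "/dev/md%d", unit)
+ * records a "from" option whose value is the formatted device name.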
+ */
+struct mntarg *
+mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct mntaarg *maa;
+ struct sbuf *sb;
+ int len;
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ sb = sbuf_new_auto();
+ va_start(ap, fmt);
+ sbuf_vprintf(sb, fmt, ap);
+ va_end(ap);
+ sbuf_finish(sb);
+ len = sbuf_len(sb) + 1;
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ bcopy(sbuf_data(sb), maa + 1, len);
+ sbuf_delete(sb);
+
+ ma->v[ma->len].iov_base = maa + 1;
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+
+ return (ma);
+}
+
+/*
+ * Add an argument which is a userland string.
+ */
+struct mntarg *
+mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
+{
+ struct mntaarg *maa;
+ char *tbuf;
+
+ if (val == NULL)
+ return (ma);
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ tbuf = (void *)(maa + 1);
+ ma->error = copyinstr(val, tbuf, len, NULL);
+ return (mount_arg(ma, name, tbuf, -1));
+}
+
+/*
+ * Plain argument.
+ *
+ * If length is -1, treat value as a C string.
+ */
+struct mntarg *
+mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
+{
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
+ if (len < 0)
+ ma->v[ma->len].iov_len = strlen(val) + 1;
+ else
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+ return (ma);
+}
+
+/*
+ * Free a mntarg structure
+ */
+static void
+free_mntarg(struct mntarg *ma)
+{
+ struct mntaarg *maa;
+
+ while (!SLIST_EMPTY(&ma->list)) {
+ maa = SLIST_FIRST(&ma->list);
+ SLIST_REMOVE_HEAD(&ma->list, next);
+ free(maa, M_MOUNT);
+ }
+ free(ma->v, M_MOUNT);
+ free(ma, M_MOUNT);
+}
+
+/*
+ * Mount a filesystem
+ */
+int
+kernel_mount(struct mntarg *ma, uint64_t flags)
+{
+ struct uio auio;
+ int error;
+
+ KASSERT(ma != NULL, ("kernel_mount NULL ma"));
+ KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
+ KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
+
+ auio.uio_iov = ma->v;
+ auio.uio_iovcnt = ma->len;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = ma->error;
+ if (!error)
+ error = vfs_donmount(curthread, flags, &auio);
+ free_mntarg(ma);
+ return (error);
+}
+
+/*
+ * A printflike function to mount a filesystem.
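+ *
+ * Arguments are name/value string pairs terminated by a NULL name, e.g.
+ * (with an illustrative device and mount point):
+ *
+ *	kernel_vmount(MNT_RDONLY,
+ *	    "fstype", "cd9660",
+ *	    "fspath", "/mnt",
+ *	    "from", "/dev/cd0",
+ *	    NULL);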
+ */
+int
+kernel_vmount(int flags, ...)
+{
+ struct mntarg *ma = NULL;
+ va_list ap;
+ const char *cp;
+ const void *vp;
+ int error;
+
+ va_start(ap, flags);
+ for (;;) {
+ cp = va_arg(ap, const char *);
+ if (cp == NULL)
+ break;
+ vp = va_arg(ap, const void *);
+ ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
+ }
+ va_end(ap);
+
+ error = kernel_mount(ma, flags);
+ return (error);
+}
+
+void
+vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
+{
+
+ bcopy(oexp, exp, sizeof(*oexp));
+ exp->ex_numsecflavors = 0;
+}
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
new file mode 100644
index 0000000..322fc9a
--- /dev/null
+++ b/sys/kern/vfs_mountroot.c
@@ -0,0 +1,1041 @@
+/*-
+ * Copyright (c) 2010 Marcel Moolenaar
+ * Copyright (c) 1999-2004 Poul-Henning Kamp
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_rootdevname.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mdioctl.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/reboot.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <geom/geom.h>
+
+/*
+ * The root filesystem is detailed in the kernel environment variable
+ * vfs.root.mountfrom, which is expected to be in the general format
+ *
+ * <vfsname>:[<path>][ <vfsname>:[<path>] ...]
+ * vfsname := the name of a VFS known to the kernel and capable
+ * of being mounted as root
+ * path := disk device name or other data used by the filesystem
+ * to locate its physical store
+ *
+ * If the environment variable vfs.root.mountfrom is a space-separated list,
+ * each list element is tried in turn and the root filesystem will be
+ * mounted from the first one that succeeds.
+ *
+ * The environment variable vfs.root.mountfrom.options is a comma-delimited
+ * set of string mount options. These mount options must be parseable
+ * by nmount() in the kernel.
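+ *
+ * For example (device names are purely illustrative):
+ *
+ *	vfs.root.mountfrom="zfs:tank ufs:/dev/da0s1a"
+ *	vfs.root.mountfrom.options="ro,noatime"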
+ */
+
+static int parse_mount(char **);
+static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
+
+/*
+ * The vnode of the system's root (/ in the filesystem, without chroot
+ * active).
+ */
+struct vnode *rootvnode;
+
+char *rootdevnames[2] = {NULL, NULL};
+
+struct root_hold_token {
+ const char *who;
+ LIST_ENTRY(root_hold_token) list;
+};
+
+static LIST_HEAD(, root_hold_token) root_holds =
+ LIST_HEAD_INITIALIZER(root_holds);
+
+enum action {
+ A_CONTINUE,
+ A_PANIC,
+ A_REBOOT,
+ A_RETRY
+};
+
+static enum action root_mount_onfail = A_CONTINUE;
+
+static int root_mount_mddev;
+static int root_mount_complete;
+
+/* By default wait up to 3 seconds for devices to appear. */
+static int root_mount_timeout = 3;
+TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
+
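+/*
+ * Take a hold that delays mounting of the root filesystem: vfs_mountroot()
+ * waits until every outstanding hold has been released again with
+ * root_mount_rel(). Returns NULL (a no-op token) if the root filesystem
+ * has already been mounted.
+ */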
+struct root_hold_token *
+root_mount_hold(const char *identifier)
+{
+ struct root_hold_token *h;
+
+ if (root_mounted())
+ return (NULL);
+
+ h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
+ h->who = identifier;
+ mtx_lock(&mountlist_mtx);
+ LIST_INSERT_HEAD(&root_holds, h, list);
+ mtx_unlock(&mountlist_mtx);
+ return (h);
+}
+
+void
+root_mount_rel(struct root_hold_token *h)
+{
+
+ if (h == NULL)
+ return;
+ mtx_lock(&mountlist_mtx);
+ LIST_REMOVE(h, list);
+ wakeup(&root_holds);
+ mtx_unlock(&mountlist_mtx);
+ free(h, M_DEVBUF);
+}
+
+int
+root_mounted(void)
+{
+
+ /* No mutex is acquired here because int stores are atomic. */
+ return (root_mount_complete);
+}
+
+void
+root_mount_wait(void)
+{
+
+ /*
+ * Panic on an obvious deadlock: this function cannot be called from
+ * the swapper thread, which runs all the SYSINITs during boot.
+ */
+ KASSERT(curthread->td_proc->p_pid != 0,
+ ("root_mount_wait: cannot be called from the swapper thread"));
+ mtx_lock(&mountlist_mtx);
+ while (!root_mount_complete) {
+ msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+ hz);
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+
+static void
+set_rootvnode(void)
+{
+ struct proc *p;
+
+ if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
+ panic("Cannot find root vnode");
+
+ VOP_UNLOCK(rootvnode, 0);
+
+ p = curthread->td_proc;
+ FILEDESC_XLOCK(p->p_fd);
+
+ if (p->p_fd->fd_cdir != NULL)
+ vrele(p->p_fd->fd_cdir);
+ p->p_fd->fd_cdir = rootvnode;
+ VREF(rootvnode);
+
+ if (p->p_fd->fd_rdir != NULL)
+ vrele(p->p_fd->fd_rdir);
+ p->p_fd->fd_rdir = rootvnode;
+ VREF(rootvnode);
+
+ FILEDESC_XUNLOCK(p->p_fd);
+}
+
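+/*
+ * Mount devfs and make it the temporary root so that device nodes are
+ * available while the real root filesystem is being located and mounted.
+ */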
+static int
+vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
+{
+ struct vfsoptlist *opts;
+ struct vfsconf *vfsp;
+ struct mount *mp;
+ int error;
+
+ *mpp = NULL;
+
+ vfsp = vfs_byname("devfs");
+ KASSERT(vfsp != NULL, ("Could not find devfs by name"));
+ if (vfsp == NULL)
+ return (ENOENT);
+
+ mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
+
+ error = VFS_MOUNT(mp);
+ KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
+ if (error)
+ return (error);
+
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ mp->mnt_opt = opts;
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ *mpp = mp;
+ set_rootvnode();
+
+ error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
+ if (error)
+ printf("kern_symlink /dev -> / returns %d\n", error);
+
+ return (error);
+}
+
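+/*
+ * Re-arrange the mount list once the real root has been mounted: the new
+ * root moves to the head of the list, the previous root (when it is not
+ * devfs itself) is remounted under /.mount or /mnt, and devfs is remounted
+ * under /dev.
+ */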
+static int
+vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
+{
+ struct nameidata nd;
+ struct mount *mporoot, *mpnroot;
+ struct vnode *vp, *vporoot, *vpdevfs;
+ char *fspath;
+ int error;
+
+ mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);
+
+ /* Shuffle the mountlist. */
+ mtx_lock(&mountlist_mtx);
+ mporoot = TAILQ_FIRST(&mountlist);
+ TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
+ if (mporoot != mpdevfs) {
+ TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
+ TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
+ }
+ TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ cache_purgevfs(mporoot);
+ if (mporoot != mpdevfs)
+ cache_purgevfs(mpdevfs);
+
+ VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);
+
+ VI_LOCK(vporoot);
+ vporoot->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vporoot);
+ vporoot->v_mountedhere = NULL;
+ mporoot->mnt_flag &= ~MNT_ROOTFS;
+ mporoot->mnt_vnodecovered = NULL;
+ vput(vporoot);
+
+ /* Set up the new rootvnode, and purge the cache */
+ mpnroot->mnt_vnodecovered = NULL;
+ set_rootvnode();
+ cache_purgevfs(rootvnode->v_mount);
+
+ if (mporoot != mpdevfs) {
+ /* Remount old root under /.mount or /mnt */
+ fspath = "/.mount";
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ fspath, td);
+ error = namei(&nd);
+ if (error) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ fspath = "/mnt";
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ fspath, td);
+ error = namei(&nd);
+ }
+ if (!error) {
+ vp = nd.ni_vp;
+ error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
+ if (!error)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (!error) {
+ cache_purge(vp);
+ mporoot->mnt_vnodecovered = vp;
+ vp->v_mountedhere = mporoot;
+ strlcpy(mporoot->mnt_stat.f_mntonname,
+ fspath, MNAMELEN);
+ VOP_UNLOCK(vp, 0);
+ } else
+ vput(vp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ if (error && bootverbose)
+ printf("mountroot: unable to remount previous root "
+ "under /.mount or /mnt (error %d).\n", error);
+ }
+
+ /* Remount devfs under /dev */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
+ error = namei(&nd);
+ if (!error) {
+ vp = nd.ni_vp;
+ error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
+ if (!error)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (!error) {
+ vpdevfs = mpdevfs->mnt_vnodecovered;
+ if (vpdevfs != NULL) {
+ cache_purge(vpdevfs);
+ vpdevfs->v_mountedhere = NULL;
+ vrele(vpdevfs);
+ }
+ mpdevfs->mnt_vnodecovered = vp;
+ vp->v_mountedhere = mpdevfs;
+ VOP_UNLOCK(vp, 0);
+ } else
+ vput(vp);
+ }
+ if (error && bootverbose)
+ printf("mountroot: unable to remount devfs under /dev "
+ "(error %d).\n", error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ if (mporoot == mpdevfs) {
+ vfs_unbusy(mpdevfs);
+ /* Unlink the no longer needed /dev/dev -> / symlink */
+ error = kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
+ if (error && bootverbose)
+ printf("mountroot: unable to unlink /dev/dev "
+ "(error %d)\n", error);
+ }
+
+ return (0);
+}
+
+/*
+ * Configuration parser.
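+ *
+ * The parser walks the configuration assembled by vfs_mountroot_conf0()
+ * (and later re-read from /.mount.conf), one entry per line: lines starting
+ * with '#' are comments, lines starting with '.' are directives (.ask, .md,
+ * .onfail, .timeout) and any other line is tried as a
+ * "<fstype>:<device> [options]" root specification.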
+ */
+
+/* Parser character classes. */
+#define CC_WHITESPACE -1
+#define CC_NONWHITESPACE -2
+
+/* Parse errors. */
+#define PE_EOF -1
+#define PE_EOL -2
+
+static __inline int
+parse_peek(char **conf)
+{
+
+ return (**conf);
+}
+
+static __inline void
+parse_poke(char **conf, int c)
+{
+
+ **conf = c;
+}
+
+static __inline void
+parse_advance(char **conf)
+{
+
+ (*conf)++;
+}
+
+static __inline int
+parse_isspace(int c)
+{
+
+ return ((c == ' ' || c == '\t' || c == '\n') ? 1 : 0);
+}
+
+static int
+parse_skipto(char **conf, int mc)
+{
+ int c, match;
+
+ while (1) {
+ c = parse_peek(conf);
+ if (c == 0)
+ return (PE_EOF);
+ switch (mc) {
+ case CC_WHITESPACE:
+ match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
+ break;
+ case CC_NONWHITESPACE:
+ if (c == '\n')
+ return (PE_EOL);
+ match = (c != ' ' && c != '\t') ? 1 : 0;
+ break;
+ default:
+ match = (c == mc) ? 1 : 0;
+ break;
+ }
+ if (match)
+ break;
+ parse_advance(conf);
+ }
+ return (0);
+}
+
+static int
+parse_token(char **conf, char **tok)
+{
+ char *p;
+ size_t len;
+ int error;
+
+ *tok = NULL;
+ error = parse_skipto(conf, CC_NONWHITESPACE);
+ if (error)
+ return (error);
+ p = *conf;
+ error = parse_skipto(conf, CC_WHITESPACE);
+ len = *conf - p;
+ *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
+ bcopy(p, *tok, len);
+ return (0);
+}
+
+static void
+parse_dir_ask_printenv(const char *var)
+{
+ char *val;
+
+ val = getenv(var);
+ if (val != NULL) {
+ printf(" %s=%s\n", var, val);
+ freeenv(val);
+ }
+}
+
+static int
+parse_dir_ask(char **conf)
+{
+ char name[80];
+ char *mnt;
+ int error;
+
+ printf("\nLoader variables:\n");
+ parse_dir_ask_printenv("vfs.root.mountfrom");
+ parse_dir_ask_printenv("vfs.root.mountfrom.options");
+
+ printf("\nManual root filesystem specification:\n");
+ printf(" <fstype>:<device> [options]\n");
+ printf(" Mount <device> using filesystem <fstype>\n");
+ printf(" and with the specified (optional) option list.\n");
+ printf("\n");
+ printf(" eg. ufs:/dev/da0s1a\n");
+ printf(" zfs:tank\n");
+ printf(" cd9660:/dev/acd0 ro\n");
+ printf(" (which is equivalent to: ");
+ printf("mount -t cd9660 -o ro /dev/acd0 /)\n");
+ printf("\n");
+ printf(" ? List valid disk boot devices\n");
+ printf(" . Yield 1 second (for background tasks)\n");
+ printf(" <empty line> Abort manual input\n");
+
+ do {
+ error = EINVAL;
+ printf("\nmountroot> ");
+ cngets(name, sizeof(name), GETS_ECHO);
+ if (name[0] == '\0')
+ break;
+ if (name[0] == '?' && name[1] == '\0') {
+ printf("\nList of GEOM managed disk devices:\n ");
+ g_dev_print();
+ continue;
+ }
+ if (name[0] == '.' && name[1] == '\0') {
+ pause("rmask", hz);
+ continue;
+ }
+ mnt = name;
+ error = parse_mount(&mnt);
+ if (error == -1)
+ printf("Invalid file system specification.\n");
+ } while (error != 0);
+
+ return (error);
+}
+
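+/*
+ * Handle the ".md <file>" directive: attach the named file as a read-only,
+ * vnode-backed md(4) device so that a following mount specification can
+ * refer to it as "md#" (parse_mount() substitutes the unit number).
+ */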
+static int
+parse_dir_md(char **conf)
+{
+ struct stat sb;
+ struct thread *td;
+ struct md_ioctl *mdio;
+ char *path, *tok;
+ int error, fd, len;
+
+ td = curthread;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+
+ len = strlen(tok);
+ mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
+ path = (void *)(mdio + 1);
+ bcopy(tok, path, len);
+ free(tok, M_TEMP);
+
+ /* Get file status. */
+ error = kern_stat(td, path, UIO_SYSSPACE, &sb);
+ if (error)
+ goto out;
+
+ /* Open /dev/mdctl so that we can attach/detach. */
+ error = kern_open(td, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0);
+ if (error)
+ goto out;
+
+ fd = td->td_retval[0];
+ mdio->md_version = MDIOVERSION;
+ mdio->md_type = MD_VNODE;
+
+ if (root_mount_mddev != -1) {
+ mdio->md_unit = root_mount_mddev;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
+ PICKUP_GIANT();
+ /* Ignore errors. We don't care. */
+ root_mount_mddev = -1;
+ }
+
+ mdio->md_file = (void *)(mdio + 1);
+ mdio->md_options = MD_AUTOUNIT | MD_READONLY;
+ mdio->md_mediasize = sb.st_size;
+ mdio->md_unit = 0;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
+ PICKUP_GIANT();
+ if (error)
+ goto out;
+
+ if (mdio->md_unit > 9) {
+ printf("rootmount: too many md units\n");
+ mdio->md_file = NULL;
+ mdio->md_options = 0;
+ mdio->md_mediasize = 0;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
+ PICKUP_GIANT();
+ /* Ignore errors. We don't care. */
+ error = ERANGE;
+ goto out;
+ }
+
+ root_mount_mddev = mdio->md_unit;
+ printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
+
+ error = kern_close(td, fd);
+
+ out:
+ free(mdio, M_TEMP);
+ return (error);
+}
+
+static int
+parse_dir_onfail(char **conf)
+{
+ char *action;
+ int error;
+
+ error = parse_token(conf, &action);
+ if (error)
+ return (error);
+
+ if (!strcmp(action, "continue"))
+ root_mount_onfail = A_CONTINUE;
+ else if (!strcmp(action, "panic"))
+ root_mount_onfail = A_PANIC;
+ else if (!strcmp(action, "reboot"))
+ root_mount_onfail = A_REBOOT;
+ else if (!strcmp(action, "retry"))
+ root_mount_onfail = A_RETRY;
+ else {
+ printf("rootmount: %s: unknown action\n", action);
+ error = EINVAL;
+ }
+
+ free(action, M_TEMP);
+ return (error);
+}
+
+static int
+parse_dir_timeout(char **conf)
+{
+ char *tok, *endtok;
+ long secs;
+ int error;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+
+ secs = strtol(tok, &endtok, 0);
+ error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
+ if (!error)
+ root_mount_timeout = secs;
+ free(tok, M_TEMP);
+ return (error);
+}
+
+static int
+parse_directive(char **conf)
+{
+ char *dir;
+ int error;
+
+ error = parse_token(conf, &dir);
+ if (error)
+ return (error);
+
+ if (strcmp(dir, ".ask") == 0)
+ error = parse_dir_ask(conf);
+ else if (strcmp(dir, ".md") == 0)
+ error = parse_dir_md(conf);
+ else if (strcmp(dir, ".onfail") == 0)
+ error = parse_dir_onfail(conf);
+ else if (strcmp(dir, ".timeout") == 0)
+ error = parse_dir_timeout(conf);
+ else {
+ printf("mountroot: invalid directive `%s'\n", dir);
+ /* Ignore the rest of the line. */
+ (void)parse_skipto(conf, '\n');
+ error = EINVAL;
+ }
+ free(dir, M_TEMP);
+ return (error);
+}
+
+static int
+parse_mount_dev_present(const char *dev)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
+ error = namei(&nd);
+ if (!error)
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error != 0) ? 0 : 1;
+}
+
+#define ERRMSGL 255
+static int
+parse_mount(char **conf)
+{
+ char *errmsg;
+ struct mntarg *ma;
+ char *dev, *fs, *opts, *tok;
+ int delay, error, timeout;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+ fs = tok;
+ error = parse_skipto(&tok, ':');
+ if (error) {
+ free(fs, M_TEMP);
+ return (error);
+ }
+ parse_poke(&tok, '\0');
+ parse_advance(&tok);
+ dev = tok;
+
+ if (root_mount_mddev != -1) {
+ /* Handle substitution for the md unit number. */
+ tok = strstr(dev, "md#");
+ if (tok != NULL)
+ tok[2] = '0' + root_mount_mddev;
+ }
+
+ /* Parse options. */
+ error = parse_token(conf, &tok);
+ opts = (error == 0) ? tok : NULL;
+
+ printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
+ (opts != NULL) ? opts : "");
+
+ errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
+
+ if (vfs_byname(fs) == NULL) {
+ strlcpy(errmsg, "unknown file system", ERRMSGL);
+ error = ENOENT;
+ goto out;
+ }
+
+ if (strcmp(fs, "zfs") != 0 && strstr(fs, "nfs") == NULL &&
+ dev[0] != '\0' && !parse_mount_dev_present(dev)) {
+ printf("mountroot: waiting for device %s ...\n", dev);
+ delay = hz / 10;
+ timeout = root_mount_timeout * hz;
+ do {
+ pause("rmdev", delay);
+ timeout -= delay;
+ } while (timeout > 0 && !parse_mount_dev_present(dev));
+ if (timeout <= 0) {
+ error = ENODEV;
+ goto out;
+ }
+ }
+
+ ma = NULL;
+ ma = mount_arg(ma, "fstype", fs, -1);
+ ma = mount_arg(ma, "fspath", "/", -1);
+ ma = mount_arg(ma, "from", dev, -1);
+ ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
+ ma = mount_arg(ma, "ro", NULL, 0);
+ ma = parse_mountroot_options(ma, opts);
+ error = kernel_mount(ma, MNT_ROOTFS);
+
+ out:
+ if (error) {
+ printf("Mounting from %s:%s failed with error %d",
+ fs, dev, error);
+ if (errmsg[0] != '\0')
+ printf(": %s", errmsg);
+ printf(".\n");
+ }
+ free(fs, M_TEMP);
+ free(errmsg, M_TEMP);
+ if (opts != NULL)
+ free(opts, M_TEMP);
+ /* kernel_mount can return -1 on error. */
+ return ((error < 0) ? EDOOFUS : error);
+}
+#undef ERRMSGL
+
+static int
+vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
+{
+ struct mount *mp;
+ char *conf;
+ int error;
+
+ root_mount_mddev = -1;
+
+retry:
+ conf = sbuf_data(sb);
+ mp = TAILQ_NEXT(mpdevfs, mnt_list);
+ error = (mp == NULL) ? 0 : EDOOFUS;
+ root_mount_onfail = A_CONTINUE;
+ while (mp == NULL) {
+ error = parse_skipto(&conf, CC_NONWHITESPACE);
+ if (error == PE_EOL) {
+ parse_advance(&conf);
+ continue;
+ }
+ if (error < 0)
+ break;
+ switch (parse_peek(&conf)) {
+ case '#':
+ error = parse_skipto(&conf, '\n');
+ break;
+ case '.':
+ error = parse_directive(&conf);
+ break;
+ default:
+ error = parse_mount(&conf);
+ break;
+ }
+ if (error < 0)
+ break;
+ /* Ignore any trailing garbage on the line. */
+ if (parse_peek(&conf) != '\n') {
+ printf("mountroot: advancing to next directive...\n");
+ (void)parse_skipto(&conf, '\n');
+ }
+ mp = TAILQ_NEXT(mpdevfs, mnt_list);
+ }
+ if (mp != NULL)
+ return (0);
+
+ /*
+ * We failed to mount (a new) root.
+ */
+ switch (root_mount_onfail) {
+ case A_CONTINUE:
+ break;
+ case A_PANIC:
+ panic("mountroot: unable to (re-)mount root.");
+ /* NOTREACHED */
+ case A_RETRY:
+ goto retry;
+ case A_REBOOT:
+ kern_reboot(RB_NOSYNC);
+ /* NOTREACHED */
+ }
+
+ return (error);
+}
+
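+/*
+ * Build the default root mount configuration from the compile-time
+ * ROOTDEVNAME option, the boot flags (RB_ASKNAME, RB_DFLTROOT, RB_CDROM),
+ * the vfs.root.mountfrom environment variables and rootdevnames[].
+ */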
+static void
+vfs_mountroot_conf0(struct sbuf *sb)
+{
+ char *s, *tok, *mnt, *opt;
+ int error;
+
+ sbuf_printf(sb, ".onfail panic\n");
+ sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
+ if (boothowto & RB_ASKNAME)
+ sbuf_printf(sb, ".ask\n");
+#ifdef ROOTDEVNAME
+ if (boothowto & RB_DFLTROOT)
+ sbuf_printf(sb, "%s\n", ROOTDEVNAME);
+#endif
+ if (boothowto & RB_CDROM) {
+ sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
+ sbuf_printf(sb, ".timeout 0\n");
+ sbuf_printf(sb, "cd9660:/dev/acd0 ro\n");
+ sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
+ }
+ s = getenv("vfs.root.mountfrom");
+ if (s != NULL) {
+ opt = getenv("vfs.root.mountfrom.options");
+ tok = s;
+ error = parse_token(&tok, &mnt);
+ while (!error) {
+ sbuf_printf(sb, "%s %s\n", mnt,
+ (opt != NULL) ? opt : "");
+ free(mnt, M_TEMP);
+ error = parse_token(&tok, &mnt);
+ }
+ if (opt != NULL)
+ freeenv(opt);
+ freeenv(s);
+ }
+ if (rootdevnames[0] != NULL)
+ sbuf_printf(sb, "%s\n", rootdevnames[0]);
+ if (rootdevnames[1] != NULL)
+ sbuf_printf(sb, "%s\n", rootdevnames[1]);
+#ifdef ROOTDEVNAME
+ if (!(boothowto & RB_DFLTROOT))
+ sbuf_printf(sb, "%s\n", ROOTDEVNAME);
+#endif
+ if (!(boothowto & RB_ASKNAME))
+ sbuf_printf(sb, ".ask\n");
+}
+
+static int
+vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
+{
+ static char buf[128];
+ struct nameidata nd;
+ off_t ofs;
+ ssize_t resid;
+ int error, flags, len;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ ofs = 0;
+ len = sizeof(buf) - 1;
+ while (1) {
+ error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td);
+ if (error)
+ break;
+ if (resid == len)
+ break;
+ buf[len - resid] = 0;
+ sbuf_printf(sb, "%s", buf);
+ ofs += len - resid;
+ }
+
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ return (error);
+}
+
+static void
+vfs_mountroot_wait(void)
+{
+ struct root_hold_token *h;
+ struct timeval lastfail;
+ int curfail;
+
+ curfail = 0;
+ while (1) {
+ DROP_GIANT();
+ g_waitidle();
+ PICKUP_GIANT();
+ mtx_lock(&mountlist_mtx);
+ if (LIST_EMPTY(&root_holds)) {
+ mtx_unlock(&mountlist_mtx);
+ break;
+ }
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("Root mount waiting for:");
+ LIST_FOREACH(h, &root_holds, list)
+ printf(" %s", h->who);
+ printf("\n");
+ }
+ msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
+ hz);
+ }
+}
+
+void
+vfs_mountroot(void)
+{
+ struct mount *mp;
+ struct sbuf *sb;
+ struct thread *td;
+ time_t timebase;
+ int error;
+
+ td = curthread;
+
+ vfs_mountroot_wait();
+
+ sb = sbuf_new_auto();
+ vfs_mountroot_conf0(sb);
+ sbuf_finish(sb);
+
+ error = vfs_mountroot_devfs(td, &mp);
+ while (!error) {
+ error = vfs_mountroot_parse(sb, mp);
+ if (!error) {
+ error = vfs_mountroot_shuffle(td, mp);
+ if (!error) {
+ sbuf_clear(sb);
+ error = vfs_mountroot_readconf(td, sb);
+ sbuf_finish(sb);
+ }
+ }
+ }
+
+ sbuf_delete(sb);
+
+ /*
+ * Iterate over all currently mounted file systems and use
+ * the time stamp found to check and/or initialize the RTC.
+ * Call inittodr() only once and pass it the largest of the
+ * timestamps we encounter.
+ */
+ timebase = 0;
+ mtx_lock(&mountlist_mtx);
+ mp = TAILQ_FIRST(&mountlist);
+ while (mp != NULL) {
+ if (mp->mnt_time > timebase)
+ timebase = mp->mnt_time;
+ mp = TAILQ_NEXT(mp, mnt_list);
+ }
+ mtx_unlock(&mountlist_mtx);
+ inittodr(timebase);
+
+ /* Keep prison0's root in sync with the global rootvnode. */
+ mtx_lock(&prison0.pr_mtx);
+ prison0.pr_root = rootvnode;
+ vref(prison0.pr_root);
+ mtx_unlock(&prison0.pr_mtx);
+
+ mtx_lock(&mountlist_mtx);
+ atomic_store_rel_int(&root_mount_complete, 1);
+ wakeup(&root_mount_complete);
+ mtx_unlock(&mountlist_mtx);
+
+ EVENTHANDLER_INVOKE(mountroot);
+}
+
+static struct mntarg *
+parse_mountroot_options(struct mntarg *ma, const char *options)
+{
+ char *p;
+ char *name, *name_arg;
+ char *val, *val_arg;
+ char *opts;
+
+ if (options == NULL || options[0] == '\0')
+ return (ma);
+
+ p = opts = strdup(options, M_MOUNT);
+ if (opts == NULL) {
+ return (ma);
+ }
+
+ while ((name = strsep(&p, ",")) != NULL) {
+ if (name[0] == '\0')
+ break;
+
+ val = strchr(name, '=');
+ if (val != NULL) {
+ *val = '\0';
+ ++val;
+ }
+ if (strcmp(name, "rw") == 0 ||
+ strcmp(name, "noro") == 0) {
+ /*
+ * The first time we mount the root file system
+ * we need to mount it 'ro', so ignore the 'rw'
+ * and 'noro' mount options.
+ */
+ continue;
+ }
+ name_arg = strdup(name, M_MOUNT);
+ val_arg = NULL;
+ if (val != NULL)
+ val_arg = strdup(val, M_MOUNT);
+
+ ma = mount_arg(ma, name_arg, val_arg,
+ (val_arg != NULL ? -1 : 0));
+ }
+ free(opts, M_MOUNT);
+ return (ma);
+}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
new file mode 100644
index 0000000..3cbc95f
--- /dev/null
+++ b/sys/kern/vfs_subr.c
@@ -0,0 +1,4775 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/extattr.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/pctrie.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+
+#include <machine/stdarg.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static void delmntque(struct vnode *vp);
+static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
+ int slpflag, int slptimeo);
+static void syncer_shutdown(void *arg, int howto);
+static int vtryrecycle(struct vnode *vp);
+static void v_incr_usecount(struct vnode *);
+static void v_decr_usecount(struct vnode *);
+static void v_decr_useonly(struct vnode *);
+static void v_upgrade_usecount(struct vnode *);
+static void vnlru_free(int);
+static void vgonel(struct vnode *);
+static void vfs_knllock(void *arg);
+static void vfs_knlunlock(void *arg);
+static void vfs_knl_assert_locked(void *arg);
+static void vfs_knl_assert_unlocked(void *arg);
+static void destroy_vpollinfo(struct vpollinfo *vi);
+
+/*
+ * Number of vnodes in existence. Increased whenever getnewvnode()
+ * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
+ */
+static unsigned long numvnodes;
+
+SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
+ "Number of vnodes in existence");
+
+/*
+ * Conversion tables for conversion from vnode types to inode formats
+ * and back.
+ */
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[10] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
+};
+
+/*
+ * List of vnodes that are ready for recycling.
+ */
+static TAILQ_HEAD(freelst, vnode) vnode_free_list;
+
+/*
+ * Free vnode target. Free vnodes may simply be files which have been stat'd
+ * but not read. This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
+ */
+static u_long wantfreevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
+static u_long freevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+ "Number of vnodes in the free list");
+
+static int vlru_allow_cache_src;
+SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
+ &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
+
+/*
+ * Various variables used for debugging the new implementation of
+ * reassignbuf().
+ * XXX these are probably of (very) limited utility now.
+ */
+static int reassignbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
+ "Number of calls to reassignbuf");
+
+/*
+ * Cache for the mount type id assigned to NFS. This is used for
+ * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
+ */
+int nfs_mount_type = -1;
+
+/* To keep more than one thread at a time from running vfs_getnewfsid */
+static struct mtx mntid_mtx;
+
+/*
+ * Lock for any access to the following:
+ * vnode_free_list
+ * numvnodes
+ * freevnodes
+ */
+static struct mtx vnode_free_list_mtx;
+
+/* Publicly exported FS */
+struct nfs_public nfs_pub;
+
+static uma_zone_t buf_trie_zone;
+
+/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
+static uma_zone_t vnode_zone;
+static uma_zone_t vnodepoll_zone;
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, filesystems mounted on
+ * block devices are delayed only about half the time that file data
+ * is delayed.
+ * Similarly, directory updates are more critical, so are only delayed
+ * about a third the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
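+/*
+ * For example, with the default SYNCER_MAXDELAY of 32 (so syncer_mask is
+ * 31), a 15 second delay requested while syncer_delayno is 20 lands in
+ * slot (20 + 15) & 31 == 3; the index simply wraps around the ring of
+ * queues.
+ */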
+static int syncer_delayno;
+static long syncer_mask;
+LIST_HEAD(synclist, bufobj);
+static struct synclist *syncer_workitem_pending;
+/*
+ * The sync_mtx protects:
+ * bo->bo_synclist
+ * sync_vnode_count
+ * syncer_delayno
+ * syncer_state
+ * syncer_workitem_pending
+ * syncer_worklist_len
+ * rushjob
+ */
+static struct mtx sync_mtx;
+static struct cv sync_wakeup;
+
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+static int syncdelay = 30; /* max time to delay syncing data */
+static int filedelay = 30; /* time to delay syncing files */
+SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
+ "Time to delay syncing files (in seconds)");
+static int dirdelay = 29; /* time to delay syncing directories */
+SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
+ "Time to delay syncing directories (in seconds)");
+static int metadelay = 28; /* time to delay syncing metadata */
+SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
+ "Time to delay syncing metadata (in seconds)");
+static int rushjob; /* number of slots to run ASAP */
+static int stat_rush_requests; /* number of times I/O speeded up */
+SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
+ "Number of times I/O speeded up (rush requests)");
+
+/*
+ * When shutting down the syncer, run it at four times normal speed.
+ */
+#define SYNCER_SHUTDOWN_SPEEDUP 4
+static int sync_vnode_count;
+static int syncer_worklist_len;
+static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
+ syncer_state;
+
+/*
+ * Number of vnodes we want to exist at any one time. This is mostly used
+ * to size hash tables in vnode-related code. It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
+ &desiredvnodes, 0, "Maximum number of vnodes");
+SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
+ &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
+static int vnlru_nowhere;
+SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
+ &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
+
+/*
+ * Macros to control when a vnode is freed and recycled. All require
+ * the vnode interlock.
+ */
+#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
+#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
+#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
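+
+/*
+ * VCANRECYCLE: on the free list and unheld, so it may be reused.
+ * VSHOULDFREE: unheld but not yet marked free, so it should be placed on
+ * the free list. VSHOULDBUSY: held while still marked free, so it must be
+ * taken off the free list.
+ */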
+
+/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
+static int vnsz2log;
+
+/*
+ * Support for the bufobj clean & dirty pctrie.
+ */
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
+
+ return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
+
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+ uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
+/*
+ * Initialize the vnode management data structures.
+ *
+ * Reevaluate the following cap on the number of vnodes after the physical
+ * memory size exceeds 512GB. In the limit, as the physical memory size
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
+ */
+#ifndef MAXVNODES_MAX
+#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
+#endif
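+
+/*
+ * With 4KB pages the default MAXVNODES_MAX above works out to
+ * 512 * (1073741824 / 4096 / 16) == 8388608 vnodes, i.e. the cap that
+ * corresponds to the sixteen-pages-per-vnode ratio at 512GB of RAM.
+ */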
+static void
+vntblinit(void *dummy __unused)
+{
+ u_int i;
+ int physvnodes, virtvnodes;
+
+ /*
+ * Desiredvnodes is a function of the physical memory size and the
+ * kernel's heap size. Generally speaking, it scales with the
+ * physical memory size. The ratio of desiredvnodes to physical pages
+ * is one to four until desiredvnodes exceeds 98,304. Thereafter, the
+ * marginal ratio of desiredvnodes to physical pages is one to
+ * sixteen. However, desiredvnodes is limited by the kernel's heap
+ * size. The memory required by desiredvnodes vnodes and vm objects
+ * may not exceed one seventh of the kernel's heap size.
+ */
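+ /*
+ * For example (illustrative figures), on a machine with 4GB of RAM
+ * (about 1048576 4KB pages) the physical-memory estimate works out to
+ * maxproc + 1048576 / 16 + 3 * min(393216, 1048576) / 16, roughly
+ * maxproc + 139264 vnodes, before the kernel-heap limit is applied.
+ */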
+ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+ cnt.v_page_count) / 16;
+ virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
+ sizeof(struct vnode)));
+ desiredvnodes = min(physvnodes, virtvnodes);
+ if (desiredvnodes > MAXVNODES_MAX) {
+ if (bootverbose)
+ printf("Reducing kern.maxvnodes %d -> %d\n",
+ desiredvnodes, MAXVNODES_MAX);
+ desiredvnodes = MAXVNODES_MAX;
+ }
+ wantfreevnodes = desiredvnodes / 4;
+ mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+ TAILQ_INIT(&vnode_free_list);
+ mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
+ vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ /*
+ * Preallocate enough nodes to support one node per buf so that
+ * an insert cannot fail. reassignbuf() callers cannot
+ * tolerate insertion failure.
+ */
+ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+ NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ uma_prealloc(buf_trie_zone, nbuf);
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+ mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
+ cv_init(&sync_wakeup, "syncer");
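+ /*
+ * Compute vnsz2log as floor(log2(sizeof(struct vnode))); getnewvnode()
+ * uses it below to derive v_hash from the vnode address.
+ */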
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
+
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Note that mountlist_mtx is not released on failure.
+ *
+ * vfs_busy() is a custom lock; it can block the caller.
+ * vfs_busy() only sleeps if an unmount is active on the mount point.
+ * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
+ * lock of any vnode belonging to mp.
+ *
+ * Lookup uses vfs_busy() to traverse mount points.
+ * root fs var fs
+ * / vnode lock A / vnode lock (/var) D
+ * /var vnode lock B /log vnode lock(/var/log) E
+ * vfs_busy lock C vfs_busy lock F
+ *
+ * Within each file system, the lock order is C->A->B and F->D->E.
+ *
+ * When traversing across mounts, the system follows that lock order:
+ *
+ * C->A->B
+ * |
+ * +->F->D->E
+ *
+ * The lookup() process for namei("/var") illustrates the process:
+ * VOP_LOOKUP() obtains B while A is held
+ * vfs_busy() obtains a shared lock on F while A and B are held
+ * vput() releases lock on B
+ * vput() releases lock on A
+ * VFS_ROOT() obtains lock on D while shared lock on F is held
+ * vfs_unbusy() releases shared lock on F
+ * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
+ * An attempt to lock A (instead of vp_crossmp) while D is held would
+ * violate the global order, causing deadlocks.
+ *
+ * dounmount() locks B while F is drained.
+ */
+int
+vfs_busy(struct mount *mp, int flags)
+{
+
+ MPASS((flags & ~MBF_MASK) == 0);
+ CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
+
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ /*
+ * If the mount point is currently being unmounted, sleep until the
+ * mount point's fate is decided. If the thread doing the unmounting
+ * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
+ * indicating that this mount point has survived the unmount attempt
+ * and vfs_busy should retry. Otherwise the unmounter thread will set
+ * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
+ * the mount point is about to be really destroyed. vfs_busy needs to
+ * release its reference on the mount point in this case and return
+ * with ENOENT, telling the caller that the mount point it tried to
+ * busy is no longer valid.
+ */
+ while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ CTR1(KTR_VFS, "%s: failed busying before sleeping",
+ __func__);
+ return (ENOENT);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_lock(&mountlist_mtx);
+ MNT_ILOCK(mp);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_lockref++;
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
+ mp->mnt_lockref--;
+ if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
+ MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
+ CTR1(KTR_VFS, "%s: waking up waiters", __func__);
+ mp->mnt_kern_flag &= ~MNTK_DRAINING;
+ wakeup(&mp->mnt_lockref);
+ }
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid_t *fsid)
+{
+ struct mount *mp;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ vfs_ref(mp);
+ mtx_unlock(&mountlist_mtx);
+ return (mp);
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Lookup a mount point by filesystem identifier, busying it before
+ * returning.
+ */
+struct mount *
+vfs_busyfs(fsid_t *fsid)
+{
+ struct mount *mp;
+ int error;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ error = vfs_busy(mp, MBF_MNTLSTLOCK);
+ if (error) {
+ mtx_unlock(&mountlist_mtx);
+ return (NULL);
+ }
+ return (mp);
+ }
+ }
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ mtx_unlock(&mountlist_mtx);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Check if a user can access privileged mount options.
+ */
+int
+vfs_suser(struct mount *mp, struct thread *td)
+{
+ int error;
+
+ /*
+ * If the thread is jailed, but this is not a jail-friendly file
+ * system, deny immediately.
+ */
+ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
+ return (EPERM);
+
+ /*
+ * If the file system was mounted outside the jail of the calling
+ * thread, deny immediately.
+ */
+ if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
+ return (EPERM);
+
+ /*
+ * If file system supports delegated administration, we don't check
+ * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
+ * by the file system itself.
+ * If this is not the user that did original mount, we check for
+ * the PRIV_VFS_MOUNT_OWNER privilege.
+ */
+ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
+ mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Get a new unique fsid. Try to make its val[0] unique, since this value
+ * will be used to create fake device numbers for stat(). Also try (but
+ * not so hard) to make its val[0] unique mod 2^16, since some emulators only
+ * support 16-bit device numbers. We end up with unique val[0]'s for the
+ * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
+ *
+ * Keep in mind that several mounts may be running in parallel. Starting
+ * the search one past where the previous search terminated is both a
+ * micro-optimization and a defense against returning the same fsid to
+ * different mounts.
+ */
+void
+vfs_getnewfsid(struct mount *mp)
+{
+ static uint16_t mntid_base;
+ struct mount *nmp;
+ fsid_t tfsid;
+ int mtype;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ mtx_lock(&mntid_mtx);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ tfsid.val[1] = mtype;
+ mtype = (mtype & 0xFF) << 24;
+ for (;;) {
+ tfsid.val[0] = makedev(255,
+ mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
+ mntid_base++;
+ if ((nmp = vfs_getvfs(&tfsid)) == NULL)
+ break;
+ vfs_rel(nmp);
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
+ mtx_unlock(&mntid_mtx);
+}
+
+/*
+ * Knob to control the precision of file timestamps:
+ *
+ * 0 = seconds only; nanoseconds zeroed.
+ * 1 = seconds and nanoseconds, accurate within 1/HZ.
+ * 2 = seconds and nanoseconds, truncated to microseconds.
+ * >=3 = seconds and nanoseconds, maximum precision.
+ */
+enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
+
+static int timestamp_precision = TSP_SEC;
+SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
+ &timestamp_precision, 0, "File timestamp precision (0: seconds, "
+ "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
+ "3+: sec + ns (max. precision))");
+
+/*
+ * Get a current timestamp.
+ */
+void
+vfs_timestamp(struct timespec *tsp)
+{
+ struct timeval tv;
+
+ switch (timestamp_precision) {
+ case TSP_SEC:
+ tsp->tv_sec = time_second;
+ tsp->tv_nsec = 0;
+ break;
+ case TSP_HZ:
+ getnanotime(tsp);
+ break;
+ case TSP_USEC:
+ microtime(&tv);
+ TIMEVAL_TO_TIMESPEC(&tv, tsp);
+ break;
+ case TSP_NSEC:
+ default:
+ nanotime(tsp);
+ break;
+ }
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(struct vattr *vap)
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_birthtime.tv_sec = VNOVAL;
+ vap->va_birthtime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
+
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free <count> vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed.
+ * The buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ */
+static int
+vlrureclaim(struct mount *mp)
+{
+ struct vnode *vp;
+ int done;
+ int trigger;
+ int usevnodes;
+ int count;
+
+ /*
+ * Calculate the trigger point, don't allow user
+ * screwups to blow us up. This prevents us from
+ * recycling vnodes with lots of resident pages. We
+ * aren't trying to free memory, we are trying to
+ * free vnodes.
+ */
+ usevnodes = desiredvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ trigger = cnt.v_page_count * 2 / usevnodes;
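+ /*
+ * For example (illustrative figures), with desiredvnodes at 100000 and
+ * 1GB of RAM (262144 4KB pages) the trigger is 5, so vnodes holding
+ * more than 5 resident pages are skipped by this pass.
+ */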
+ done = 0;
+ vn_start_write(NULL, &mp, V_WAIT);
+ MNT_ILOCK(mp);
+ count = mp->mnt_nvnodelistsize / 10 + 1;
+ while (count != 0) {
+ vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ while (vp != NULL && vp->v_type == VMARKER)
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+ if (vp == NULL)
+ break;
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ --count;
+ if (!VI_TRYLOCK(vp))
+ goto next_iter;
+ /*
+ * If it's been deconstructed already, it's still
+ * referenced, or it exceeds the trigger, skip it.
+ */
+ if (vp->v_usecount ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
+ (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VI_UNLOCK(vp);
+ goto next_iter;
+ }
+ MNT_IUNLOCK(mp);
+ vholdl(vp);
+ if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ VI_LOCK(vp);
+ /*
+ * v_usecount may have been bumped after VOP_LOCK() dropped
+ * the vnode interlock and before it was locked again.
+ *
+ * It is not necessary to recheck VI_DOOMED because it can
+ * only be set by another thread that holds both the vnode
+ * lock and vnode interlock. If another thread has the
+ * vnode lock before we get to VOP_LOCK() and obtains the
+ * vnode interlock after VOP_LOCK() drops the vnode
+ * interlock, the other thread will be unable to drop the
+ * vnode lock before our VOP_LOCK() call fails.
+ */
+ if (vp->v_usecount ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
+ (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ KASSERT((vp->v_iflag & VI_DOOMED) == 0,
+ ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
+ vgonel(vp);
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ done++;
+next_iter_mntunlocked:
+ if (!should_yield())
+ goto relock_mnt;
+ goto yield;
+next_iter:
+ if (!should_yield())
+ continue;
+ MNT_IUNLOCK(mp);
+yield:
+ kern_yield(PRI_USER);
+relock_mnt:
+ MNT_ILOCK(mp);
+ }
+ MNT_IUNLOCK(mp);
+ vn_finished_write(mp);
+ return done;
+}
+
+/*
+ * Attempt to keep the free list at wantfreevnodes length.
+ */
+static void
+vnlru_free(int count)
+{
+ struct vnode *vp;
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ for (; count > 0; count--) {
+ vp = TAILQ_FIRST(&vnode_free_list);
+ /*
+ * The list can be modified while the free_list_mtx
+ * has been dropped and vp could be NULL here.
+ */
+ if (!vp)
+ break;
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vnlru_free: vnode already reclaimed."));
+ KASSERT((vp->v_iflag & VI_FREE) != 0,
+ ("Removing vnode not on freelist"));
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Mangling active vnode"));
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+ /*
+ * Don't recycle if we can't get the interlock.
+ */
+ if (!VI_TRYLOCK(vp)) {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ continue;
+ }
+ VNASSERT(VCANRECYCLE(vp), vp,
+ ("vp inconsistent on freelist"));
+ freevnodes--;
+ vp->v_iflag &= ~VI_FREE;
+ vholdl(vp);
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ vtryrecycle(vp);
+ /*
+ * If the recycle succeeded, this vdrop will actually free
+ * the vnode. If not, it will simply place it back on
+ * the free list.
+ */
+ vdrop(vp);
+ mtx_lock(&vnode_free_list_mtx);
+ }
+}
+
+/*
+ * Attempt to recycle vnodes in a context that is always safe to block.
+ * Calling vlrureclaim() from the bowels of filesystem code has some
+ * interesting deadlock problems.
+ */
+static struct proc *vnlruproc;
+static int vnlruproc_sig;
+
+static void
+vnlru_proc(void)
+{
+ struct mount *mp, *nmp;
+ int done;
+ struct proc *p = vnlruproc;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+ SHUTDOWN_PRI_FIRST);
+
+ for (;;) {
+ kproc_suspend_check(p);
+ mtx_lock(&vnode_free_list_mtx);
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ if (numvnodes <= desiredvnodes * 9 / 10) {
+ vnlruproc_sig = 0;
+ wakeup(&vnlruproc_sig);
+ msleep(vnlruproc, &vnode_free_list_mtx,
+ PVFS|PDROP, "vlruwt", hz);
+ continue;
+ }
+ mtx_unlock(&vnode_free_list_mtx);
+ done = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ done += vlrureclaim(mp);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (done == 0) {
+#if 0
+ /* These messages are temporary debugging aids */
+ if (vnlru_nowhere < 5)
+ printf("vnlru process getting nowhere..\n");
+ else if (vnlru_nowhere == 5)
+ printf("vnlru process messages stopped.\n");
+#endif
+ vnlru_nowhere++;
+ tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
+ } else
+ kern_yield(PRI_USER);
+ }
+}
+
+static struct kproc_desc vnlru_kp = {
+ "vnlru",
+ vnlru_proc,
+ &vnlruproc
+};
+SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
+ &vnlru_kp);
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+/*
+ * Try to recycle a freed vnode. We abort if anyone picks up a reference
+ * before we actually vgone(). This function must be called with the vnode
+ * held to prevent the vnode from being returned to the free list midway
+ * through vgone().
+ */
+static int
+vtryrecycle(struct vnode *vp)
+{
+ struct mount *vnmp;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vtryrecycle: Recycling vp %p without a reference.", vp));
+ /*
+ * This vnode may be found and locked via some other list; if so we
+ * can't recycle it yet.
+ */
+ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, vp %p lock is already held",
+ __func__, vp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Don't recycle if its filesystem is being suspended.
+ */
+ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
+ VOP_UNLOCK(vp, 0);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, cannot start the write for %p",
+ __func__, vp);
+ return (EBUSY);
+ }
+ /*
+ * If we got this far, we need to acquire the interlock and see if
+ * anyone picked up this vnode from another list. If not, we will
+ * mark it with DOOMED via vgonel() so that anyone who does find it
+ * will skip over it.
+ */
+ VI_LOCK(vp);
+ if (vp->v_usecount) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, %p is already referenced",
+ __func__, vp);
+ return (EBUSY);
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0)
+ vgonel(vp);
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ return (0);
+}
+
+/*
+ * Wait for available vnodes.
+ */
+static int
+getnewvnode_wait(int suspended)
+{
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ if (numvnodes > desiredvnodes) {
+ if (suspended) {
+ /*
+ * The file system is being suspended; we cannot risk a
+ * deadlock here, so allocate the new vnode anyway.
+ */
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ return (0);
+ }
+ if (vnlruproc_sig == 0) {
+ vnlruproc_sig = 1; /* avoid unnecessary wakeups */
+ wakeup(vnlruproc);
+ }
+ msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
+ "vlruwk", hz);
+ }
+ return (numvnodes > desiredvnodes ? ENFILE : 0);
+}
+
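+/*
+ * Pre-reserve vnode allocations for the calling thread. Each successful
+ * reservation bumps both the global numvnodes count and the thread's
+ * td_vp_reserv, which getnewvnode() later consumes without taking the
+ * free-list mutex.
+ */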
+void
+getnewvnode_reserve(u_int count)
+{
+ struct thread *td;
+
+ td = curthread;
+ mtx_lock(&vnode_free_list_mtx);
+ while (count > 0) {
+ if (getnewvnode_wait(0) == 0) {
+ count--;
+ td->td_vp_reserv++;
+ numvnodes++;
+ }
+ }
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+void
+getnewvnode_drop_reserve(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ mtx_lock(&vnode_free_list_mtx);
+ KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
+ numvnodes -= td->td_vp_reserv;
+ mtx_unlock(&vnode_free_list_mtx);
+ td->td_vp_reserv = 0;
+}
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ struct thread *td;
+ int error;
+
+ CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+ vp = NULL;
+ td = curthread;
+ if (td->td_vp_reserv > 0) {
+ td->td_vp_reserv -= 1;
+ goto alloc;
+ }
+ mtx_lock(&vnode_free_list_mtx);
+ /*
+ * Lend our context to reclaim vnodes if they've exceeded the max.
+ */
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(1);
+ error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+ MNTK_SUSPEND));
+#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
+ if (error != 0) {
+ mtx_unlock(&vnode_free_list_mtx);
+ return (error);
+ }
+#endif
+ numvnodes++;
+ mtx_unlock(&vnode_free_list_mtx);
+alloc:
+ vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
+ /*
+ * Setup locks.
+ */
+ vp->v_vnlock = &vp->v_lock;
+ mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ /*
+ * By default, don't allow shared locks unless filesystems
+ * opt-in.
+ */
+ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
+ /*
+ * Initialize bufobj.
+ */
+ bo = &vp->v_bufobj;
+ bo->__bo_vnode = vp;
+ rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+ bo->bo_ops = &buf_ops_bio;
+ bo->bo_private = vp;
+ TAILQ_INIT(&bo->bo_clean.bv_hd);
+ TAILQ_INIT(&bo->bo_dirty.bv_hd);
+ /*
+ * Initialize namecache.
+ */
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ /*
+ * Finalize various vnode identity bits.
+ */
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ v_incr_usecount(vp);
+ vp->v_data = NULL;
+#ifdef MAC
+ mac_vnode_init(vp);
+ if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
+ mac_vnode_associate_singlelabel(mp, vp);
+ else if (mp == NULL && vops != &dead_vnodeops)
+ printf("NULL mp in getnewvnode()\n");
+#endif
+ if (mp != NULL) {
+ bo->bo_bsize = mp->mnt_stat.f_iosize;
+ if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
+ vp->v_vflag |= VV_NOKNOTE;
+ }
+ rangelock_init(&vp->v_rl);
+
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Delete from old mount point vnode list, if on one.
+ */
+static void
+delmntque(struct vnode *vp)
+{
+ struct mount *mp;
+ int active;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
+ ("Active vnode list size %d > Vnode list size %d",
+ mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
+ active = vp->v_iflag & VI_ACTIVE;
+ vp->v_iflag &= ~VI_ACTIVE;
+ if (active) {
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ mtx_unlock(&vnode_free_list_mtx);
+ }
+ vp->v_mount = NULL;
+ VI_UNLOCK(vp);
+ VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
+ ("bad mount point vnode list size"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ mp->mnt_nvnodelistsize--;
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+static void
+insmntque_stddtr(struct vnode *vp, void *dtr_arg)
+{
+
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+int
+insmntque1(struct vnode *vp, struct mount *mp,
+ void (*dtr)(struct vnode *, void *), void *dtr_arg)
+{
+
+ KASSERT(vp->v_mount == NULL,
+ ("insmntque: vnode already on per mount vnode list"));
+ VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
+ ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
+
+ /*
+ * We acquire the vnode interlock early to ensure that the
+ * vnode cannot be recycled by another process releasing a
+ * holdcnt on it before we get it on both the vnode list
+ * and the active vnode list. The mount mutex protects only
+ * manipulation of the vnode list and the vnode freelist
+ * mutex protects only manipulation of the active vnode list.
+ * Hence the need to hold the vnode interlock throughout.
+ */
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+ ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
+ mp->mnt_nvnodelistsize == 0)) &&
+ (vp->v_vflag & VV_FORCEINSMQ) == 0) {
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ if (dtr != NULL)
+ dtr(vp, dtr_arg);
+ return (EBUSY);
+ }
+ vp->v_mount = mp;
+ MNT_REF(mp);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
+ ("neg mount point vnode list size"));
+ mp->mnt_nvnodelistsize++;
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag |= VI_ACTIVE;
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+int
+insmntque(struct vnode *vp, struct mount *mp)
+{
+
+ return (insmntque1(vp, mp, insmntque_stddtr, NULL));
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a bufobj
+ * Called with the underlying object locked.
+ */
+int
+bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
+{
+ int error;
+
+ BO_LOCK(bo);
+ if (flags & V_SAVE) {
+ error = bufobj_wwait(bo, slpflag, slptimeo);
+ if (error) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ if (bo->bo_dirty.bv_cnt > 0) {
+ BO_UNLOCK(bo);
+ if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
+ return (error);
+ /*
+ * XXX We could save a lock/unlock if this was only
+ * enabled under INVARIANTS
+ */
+ BO_LOCK(bo);
+ if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
+ panic("vinvalbuf: dirty bufs");
+ }
+ }
+ /*
+ * If you alter this loop please notice that interlock is dropped and
+ * reacquired in flushbuflist. Special care is needed to ensure that
+ * no race conditions occur from this.
+ */
+ do {
+ error = flushbuflist(&bo->bo_clean,
+ flags, bo, slpflag, slptimeo);
+ if (error == 0 && !(flags & V_CLEANONLY))
+ error = flushbuflist(&bo->bo_dirty,
+ flags, bo, slpflag, slptimeo);
+ if (error != 0 && error != EAGAIN) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ } while (error != 0);
+
+ /*
+ * Wait for I/O to complete. XXX needs cleaning up. The vnode can
+ * have write I/O in-progress but if there is a VM object then the
+ * VM object can also have read-I/O in-progress.
+ */
+ do {
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ if (bo->bo_object != NULL) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_pip_wait(bo->bo_object, "bovlbx");
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+ BO_LOCK(bo);
+ } while (bo->bo_numoutput > 0);
+ BO_UNLOCK(bo);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ if (bo->bo_object != NULL &&
+ (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+ OBJPR_CLEANONLY : 0);
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+
+#ifdef INVARIANTS
+ BO_LOCK(bo);
+ if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
+ (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
+ panic("vinvalbuf: flush failed");
+ BO_UNLOCK(bo);
+#endif
+ return (0);
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
+{
+
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+ ASSERT_VOP_LOCKED(vp, "vinvalbuf");
+ return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
+}
+
+/*
+ * Flush out buffers on the specified list.
+ */
+static int
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+ int slptimeo)
+{
+ struct buf *bp, *nbp;
+ int retval, error;
+ daddr_t lblkno;
+ b_xflags_t xflags;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ retval = 0;
+ TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
+ if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
+ ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
+ continue;
+ }
+ lblkno = 0;
+ xflags = 0;
+ if (nbp != NULL) {
+ lblkno = nbp->b_lblkno;
+ xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
+ }
+ retval = EAGAIN;
+ error = BUF_TIMELOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
+ "flushbuf", slpflag, slptimeo);
+ if (error) {
+ BO_LOCK(bo);
+ return (error != ENOLCK ? error : EAGAIN);
+ }
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ if (bp->b_bufobj != bo) { /* XXX: necessary ? */
+ BUF_UNLOCK(bp);
+ BO_LOCK(bo);
+ return (EAGAIN);
+ }
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ bwrite(bp);
+ BO_LOCK(bo);
+ return (EAGAIN); /* XXX: why not loop ? */
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (nbp->b_bufobj != bo ||
+ nbp->b_lblkno != lblkno ||
+ (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
+ break; /* nbp invalid */
+ }
+ return (retval);
+}
+
+/*
+ * Truncate a file's buffers and pages to a specified length. This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
+{
+ struct buf *bp, *nbp;
+ int anyfreed;
+ int trunclbn;
+ struct bufobj *bo;
+
+ CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
+ vp, cred, blksize, (uintmax_t)length);
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ trunclbn = (length + blksize - 1) / blksize;
+
+ ASSERT_VOP_LOCKED(vp, "vtruncbuf");
+restart:
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ anyfreed = 1;
+ for (;anyfreed;) {
+ anyfreed = 0;
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < trunclbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK)
+ goto restart;
+
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI))) {
+ BO_UNLOCK(bo);
+ goto restart;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < trunclbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK)
+ goto restart;
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0)) {
+ BO_UNLOCK(bo);
+ goto restart;
+ }
+ }
+ }
+
+ if (length > 0) {
+restartsync:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno > 0)
+ continue;
+ /*
+ * Since we hold the vnode lock this should only
+ * fail if we're racing with the buf daemon.
+ */
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ goto restart;
+ }
+ VNASSERT((bp->b_flags & B_DELWRI), vp,
+ ("buf(%p) on dirty queue without DELWRI", bp));
+
+ bremfree(bp);
+ bawrite(bp);
+ BO_LOCK(bo);
+ goto restartsync;
+ }
+ }
+
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+static void
+buf_vlist_remove(struct buf *bp)
+{
+ struct bufv *bv;
+
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ ASSERT_BO_WLOCKED(bp->b_bufobj);
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
+ (BX_VNDIRTY|BX_VNCLEAN),
+ ("buf_vlist_remove: Buf %p is on two lists", bp));
+ if (bp->b_xflags & BX_VNDIRTY)
+ bv = &bp->b_bufobj->bo_dirty;
+ else
+ bv = &bp->b_bufobj->bo_clean;
+ BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
+ TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
+ bv->bv_cnt--;
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+}
+
+/*
+ * Add the buffer to the sorted clean or dirty block list.
+ *
+ * NOTE: xflags is passed as a constant, optimizing this inline function!
+ */
+static void
+buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
+{
+ struct bufv *bv;
+ struct buf *n;
+ int error;
+
+ ASSERT_BO_WLOCKED(bo);
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
+ ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
+ bp->b_xflags |= xflags;
+ if (xflags & BX_VNDIRTY)
+ bv = &bo->bo_dirty;
+ else
+ bv = &bo->bo_clean;
+
+ /*
+ * Keep the list ordered. Optimize empty list insertion. Assume
+ * we tend to grow at the tail so lookup_le should usually be cheaper
+ * than _ge.
+ */
+ if (bv->bv_cnt == 0 ||
+ bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
+ TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
+ else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+ TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+ else
+ TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+ error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+ if (error)
+ panic("buf_vlist_add: Preallocated nodes insufficient.");
+ bv->bv_cnt++;
+}
+
+/*
+ * Look up a buffer in a bufobj's buffer tries. Note that we specifically
+ * avoid shadow buffers used in background bitmap writes.
+ *
+ * This code is not quite as efficient as it could be because we maintain
+ * two sorted lists (clean and dirty) and do not know which list the
+ * block resides in, so both tries may have to be searched.
+ */
+struct buf *
+gbincore(struct bufobj *bo, daddr_t lblkno)
+{
+ struct buf *bp;
+
+ ASSERT_BO_LOCKED(bo);
+ bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+ if (bp != NULL)
+ return (bp);
+ return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(struct vnode *vp, struct buf *bp)
+{
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ ASSERT_BO_WLOCKED(bo);
+ VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
+
+ CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
+ VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
+ ("bgetvp: bp already attached! %p", bp));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ bp->b_bufobj = bo;
+ /*
+ * Insert onto list for new vnode.
+ */
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(struct buf *bp)
+{
+ struct bufobj *bo;
+ struct vnode *vp;
+
+ CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp; /* XXX */
+ bo = bp->b_bufobj;
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("brelvp: Buffer %p not on queue.", bp);
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ bo->bo_flag &= ~BO_ONWORKLST;
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ }
+ bp->b_vp = NULL;
+ bp->b_bufobj = NULL;
+ BO_UNLOCK(bo);
+ vdrop(vp);
+}
+
+/*
+ * Add an item to the syncer work queue.
+ */
+static void
+vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
+{
+ int slot;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ mtx_lock(&sync_mtx);
+ if (bo->bo_flag & BO_ONWORKLST)
+ LIST_REMOVE(bo, bo_synclist);
+ else {
+ bo->bo_flag |= BO_ONWORKLST;
+ syncer_worklist_len++;
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
+ mtx_unlock(&sync_mtx);
+}
+
+static int
+sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
+{
+ int error, len;
+
+ mtx_lock(&sync_mtx);
+ len = syncer_worklist_len - sync_vnode_count;
+ mtx_unlock(&sync_mtx);
+ error = SYSCTL_OUT(req, &len, sizeof(len));
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+ sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
+
+static struct proc *updateproc;
+static void sched_sync(void);
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+
+static int
+sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
+{
+ struct vnode *vp;
+ struct mount *mp;
+
+ *bo = LIST_FIRST(slp);
+ if (*bo == NULL)
+ return (0);
+ vp = (*bo)->__bo_vnode; /* XXX */
+ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
+ return (1);
+ /*
+ * We use vhold in case the vnode does not
+ * successfully sync. vhold prevents the vnode from
+ * going away when we unlock the sync_mtx so that
+ * we can acquire the vnode interlock.
+ */
+ vholdl(vp);
+ mtx_unlock(&sync_mtx);
+ VI_UNLOCK(vp);
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (*bo == LIST_FIRST(slp));
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ (void) VOP_FSYNC(vp, MNT_LAZY, td);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ BO_LOCK(*bo);
+ if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
+ /*
+ * Put us back on the worklist. The worklist
+ * routine will remove us from our current
+ * position and then add us back in at a later
+ * position.
+ */
+ vn_syncer_add_to_worklist(*bo, syncdelay);
+ }
+ BO_UNLOCK(*bo);
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (0);
+}
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+static void
+sched_sync(void)
+{
+ struct synclist *next, *slp;
+ struct bufobj *bo;
+ long starttime;
+ struct thread *td = curthread;
+ int last_work_seen;
+ int net_worklist_len;
+ int syncer_final_iter;
+ int first_printf;
+ int error;
+
+ last_work_seen = 0;
+ syncer_final_iter = 0;
+ first_printf = 1;
+ syncer_state = SYNCER_RUNNING;
+ starttime = time_uptime;
+ td->td_pflags |= TDP_NORUNNINGBUF;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
+ SHUTDOWN_PRI_LAST);
+
+ mtx_lock(&sync_mtx);
+ for (;;) {
+ if (syncer_state == SYNCER_FINAL_DELAY &&
+ syncer_final_iter == 0) {
+ mtx_unlock(&sync_mtx);
+ kproc_suspend_check(td->td_proc);
+ mtx_lock(&sync_mtx);
+ }
+ net_worklist_len = syncer_worklist_len - sync_vnode_count;
+ if (syncer_state != SYNCER_RUNNING &&
+ starttime != time_uptime) {
+ if (first_printf) {
+ printf("\nSyncing disks, vnodes remaining...");
+ first_printf = 0;
+ }
+ printf("%d ", net_worklist_len);
+ }
+ starttime = time_uptime;
+
+ /*
+ * Push files whose dirty time has expired. Be careful
+ * of interrupt race on slp queue.
+ *
+ * Skip over empty worklist slots when shutting down.
+ */
+ do {
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ next = &syncer_workitem_pending[syncer_delayno];
+ /*
+ * If the worklist has wrapped since it
+ * was emptied of all but syncer vnodes,
+ * switch to the FINAL_DELAY state and run
+ * for one more second.
+ */
+ if (syncer_state == SYNCER_SHUTTING_DOWN &&
+ net_worklist_len == 0 &&
+ last_work_seen == syncer_delayno) {
+ syncer_state = SYNCER_FINAL_DELAY;
+ syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
+ }
+ } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
+ syncer_worklist_len > 0);
+
+ /*
+ * Keep track of the last time there was anything
+ * on the worklist other than syncer vnodes.
+ * Return to the SHUTTING_DOWN state if any
+ * new work appears.
+ */
+ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
+ last_work_seen = syncer_delayno;
+ if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ while (!LIST_EMPTY(slp)) {
+ error = sync_vnode(slp, &bo, td);
+ if (error == 1) {
+ LIST_REMOVE(bo, bo_synclist);
+ LIST_INSERT_HEAD(next, bo, bo_synclist);
+ continue;
+ }
+
+ if (first_printf == 0)
+ wdog_kern_pat(WD_LASTVAL);
+
+ }
+ if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
+ syncer_final_iter--;
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * Just sleep for a short period of time between
+ * iterations when shutting down to allow some I/O
+ * to happen.
+ *
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (syncer_state != SYNCER_RUNNING ||
+ time_uptime == starttime) {
+ thread_lock(td);
+ sched_prio(td, PPAUSE);
+ thread_unlock(td);
+ }
+ if (syncer_state != SYNCER_RUNNING)
+ cv_timedwait(&sync_wakeup, &sync_mtx,
+ hz / SYNCER_SHUTDOWN_SPEEDUP);
+ else if (time_uptime == starttime)
+ cv_timedwait(&sync_wakeup, &sync_mtx, hz);
+ }
+}
+
+/*
+ * Request the syncer daemon to speed up its work.
+ * We never push it to speed up more than half of its
+ * normal turn time, otherwise it could take over the cpu.
+ */
+int
+speedup_syncer(void)
+{
+ int ret = 0;
+
+ mtx_lock(&sync_mtx);
+ if (rushjob < syncdelay / 2) {
+ rushjob += 1;
+ stat_rush_requests += 1;
+ ret = 1;
+ }
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ return (ret);
+}
+
+/*
+ * Tell the syncer to speed up its work and run through its work
+ * list several times, then tell it to shut down.
+ */
+static void
+syncer_shutdown(void *arg, int howto)
+{
+
+ if (howto & RB_NOSYNC)
+ return;
+ mtx_lock(&sync_mtx);
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ rushjob = 0;
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ kproc_shutdown(arg, howto);
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(struct buf *bp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ int delay;
+#ifdef INVARIANTS
+ struct bufv *bv;
+#endif
+
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ ++reassignbufcalls;
+
+ CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("reassignbuf: Buffer %p not on queue.", bp);
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ if ((bo->bo_flag & BO_ONWORKLST) == 0) {
+ switch (vp->v_type) {
+ case VDIR:
+ delay = dirdelay;
+ break;
+ case VCHR:
+ delay = metadelay;
+ break;
+ default:
+ delay = filedelay;
+ }
+ vn_syncer_add_to_worklist(bo, delay);
+ }
+ buf_vlist_add(bp, bo, BX_VNDIRTY);
+ } else {
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ }
+#ifdef INVARIANTS
+ bv = &bo->bo_clean;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bv = &bo->bo_dirty;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+#endif
+ BO_UNLOCK(bo);
+}
+
+/*
+ * Increment the use and hold counts on the vnode, taking care to reference
+ * the driver's usecount if this is a chardev. The vholdl() will remove
+ * the vnode from the free list if it is presently free. Requires the
+ * vnode interlock and returns with it held.
+ */
+static void
+v_incr_usecount(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount++;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount++;
+ dev_unlock();
+ }
+ vholdl(vp);
+}
+
+/*
+ * Turn a holdcnt into a use+holdcnt such that only one call to
+ * v_decr_usecount is needed.
+ */
+static void
+v_upgrade_usecount(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount++;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount++;
+ dev_unlock();
+ }
+}
+
+/*
+ * Decrement the vnode use and hold count along with the driver's usecount
+ * if this is a chardev. The vdropl() below releases the vnode interlock
+ * as it may free the vnode.
+ */
+static void
+v_decr_usecount(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ VNASSERT(vp->v_usecount > 0, vp,
+ ("v_decr_usecount: negative usecount"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount--;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount--;
+ dev_unlock();
+ }
+ vdropl(vp);
+}
+
+/*
+ * Decrement only the use count and driver use count. This is intended to
+ * be paired with a follow on vdropl() to release the remaining hold count.
+ * In this way we may vgone() a vnode with a 0 usecount without risk of
+ * having it end up on a free list because the hold count is kept above 0.
+ */
+static void
+v_decr_useonly(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ VNASSERT(vp->v_usecount > 0, vp,
+ ("v_decr_useonly: negative usecount"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount--;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount--;
+ dev_unlock();
+ }
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. VI_DOOMED is set if the vnode
+ * is being destroyed. Only callers who specify LK_RETRY will
+ * see doomed vnodes. If inactive processing was delayed in
+ * vput(), try to do it here.
+ */
+int
+vget(struct vnode *vp, int flags, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vget: invalid lock operation"));
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+
+ if ((flags & LK_INTERLOCK) == 0)
+ VI_LOCK(vp);
+ vholdl(vp);
+ if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
+ vdrop(vp);
+ CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
+ vp);
+ return (error);
+ }
+ if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
+ panic("vget: vn_lock failed to return ENOENT\n");
+ VI_LOCK(vp);
+ /* Upgrade our holdcnt to a usecount. */
+ v_upgrade_usecount(vp);
+ /*
+ * We don't guarantee that any particular close will
+ * trigger inactive processing so just make a best effort
+ * here at preventing a reference to a removed file. If
+ * we don't succeed no harm is done.
+ */
+ if (vp->v_iflag & VI_OWEINACT) {
+ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+ (flags & LK_NOWAIT) == 0)
+ vinactive(vp, td);
+ vp->v_iflag &= ~VI_OWEINACT;
+ }
+ VI_UNLOCK(vp);
+ return (0);
+}
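+
+/*
+ * Illustrative sketch (not part of the interface itself): a caller that
+ * reaches a vnode through a list walk typically holds only the vnode
+ * interlock, lets vget() consume it, and pairs the reference with
+ * vput() when done:
+ *
+ *	VI_LOCK(vp);
+ *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, curthread) == 0) {
+ *		... work on the locked, referenced vnode ...
+ *		vput(vp);
+ *	}
+ *
+ * Without LK_RETRY, vn_lock() fails for a doomed vnode and vget()
+ * returns the error after dropping its transient hold.
+ */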
+
+/*
+ * Increase the reference count of a vnode.
+ */
+void
+vref(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VI_LOCK(vp);
+ v_incr_usecount(vp);
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Return reference count of a vnode.
+ *
+ * The results of this call are only guaranteed when some mechanism other
+ * than the VI lock is used to stop other processes from gaining references
+ * to the vnode. This may be the case if the caller holds the only reference.
+ * This is also useful when stale data is acceptable as race conditions may
+ * be accounted for by some other means.
+ */
+int
+vrefcnt(struct vnode *vp)
+{
+ int usecnt;
+
+ VI_LOCK(vp);
+ usecnt = vp->v_usecount;
+ VI_UNLOCK(vp);
+
+ return (usecnt);
+}
+
+#define VPUTX_VRELE 1
+#define VPUTX_VPUT 2
+#define VPUTX_VUNREF 3
+
+static void
+vputx(struct vnode *vp, int func)
+{
+ int error;
+
+ KASSERT(vp != NULL, ("vputx: null vp"));
+ if (func == VPUTX_VUNREF)
+ ASSERT_VOP_LOCKED(vp, "vunref");
+ else if (func == VPUTX_VPUT)
+ ASSERT_VOP_LOCKED(vp, "vput");
+ else
+ KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VI_LOCK(vp);
+
+ /* Skip this v_writecount check if we're going to panic below. */
+ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
+ ("vputx: missed vn_close"));
+ error = 0;
+
+ if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
+ vp->v_usecount == 1)) {
+ if (func == VPUTX_VPUT)
+ VOP_UNLOCK(vp, 0);
+ v_decr_usecount(vp);
+ return;
+ }
+
+ if (vp->v_usecount != 1) {
+ vprint("vputx: negative ref count", vp);
+ panic("vputx: negative ref cnt");
+ }
+ CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
+ /*
+ * We want to hold the vnode until the inactive finishes to
+ * prevent vgone() races. We drop the use count here and the
+ * hold count below when we're done.
+ */
+ v_decr_useonly(vp);
+ /*
+ * We must call VOP_INACTIVE with the node locked. Mark
+ * as VI_DOINGINACT to avoid recursion.
+ */
+ vp->v_iflag |= VI_OWEINACT;
+ switch (func) {
+ case VPUTX_VRELE:
+ error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
+ VI_LOCK(vp);
+ break;
+ case VPUTX_VPUT:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+ error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
+ LK_NOWAIT);
+ VI_LOCK(vp);
+ }
+ break;
+ case VPUTX_VUNREF:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ error = EBUSY;
+ break;
+ }
+ if (vp->v_usecount > 0)
+ vp->v_iflag &= ~VI_OWEINACT;
+ if (error == 0) {
+ if (vp->v_iflag & VI_OWEINACT)
+ vinactive(vp, curthread);
+ if (func != VPUTX_VUNREF)
+ VOP_UNLOCK(vp, 0);
+ }
+ vdropl(vp);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VRELE);
+}
+
+/*
+ * Release an already locked vnode. This gives the same effect as
+ * unlock+vrele(), but takes less time and avoids releasing and
+ * re-acquiring the lock (as vrele() acquires the lock internally).
+ */
+void
+vput(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VPUT);
+}
+
+/*
+ * Release an exclusively locked vnode. Do not unlock the vnode lock.
+ */
+void
+vunref(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VUNREF);
+}
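+
+/*
+ * Illustrative sketch of how the three release paths differ for a
+ * caller that holds a locked, referenced vnode:
+ *
+ *	vput(vp);			drops the reference and the lock
+ *	vunref(vp); VOP_UNLOCK(vp, 0);	same result, lock dropped by hand
+ *	VOP_UNLOCK(vp, 0); vrele(vp);	same result, but vrele() may have
+ *					to relock internally for inactive
+ *					processing
+ */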
+
+/*
+ * Somebody doesn't want the vnode recycled.
+ */
+void
+vhold(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vholdl(vp);
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Increase the hold count and activate if this is the first reference.
+ */
+void
+vholdl(struct vnode *vp)
+{
+ struct mount *mp;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_holdcnt++;
+ if (!VSHOULDBUSY(vp))
+ return;
+ ASSERT_VI_LOCKED(vp, "vholdl");
+ VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
+ VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
+ /*
+ * Remove a vnode from the free list, mark it as in use,
+ * and put it on the active list.
+ */
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+ freevnodes--;
+ vp->v_iflag &= ~(VI_FREE|VI_AGE);
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag |= VI_ACTIVE;
+ mp = vp->v_mount;
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+/*
+ * Note that there is one less who cares about this vnode.
+ * vdrop() is the opposite of vhold().
+ */
+void
+vdrop(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vdropl(vp);
+}
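+
+/*
+ * Illustrative sketch: vhold()/vdrop() bracket work that must keep a
+ * vnode from being recycled without taking a usecount, for example
+ * while a traversal drops the interlock:
+ *
+ *	vholdl(vp);		interlock held on entry
+ *	VI_UNLOCK(vp);
+ *	... examine the vnode; it cannot be freed ...
+ *	vdrop(vp);		retakes and releases the interlock
+ */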
+
+/*
+ * Drop the hold count of the vnode. If this is the last reference to
+ * the vnode we place it on the free list unless it has been vgone'd
+ * (marked VI_DOOMED) in which case we will free it.
+ */
+void
+vdropl(struct vnode *vp)
+{
+ struct bufobj *bo;
+ struct mount *mp;
+ int active;
+
+ ASSERT_VI_LOCKED(vp, "vdropl");
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ if (vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt %d", vp->v_holdcnt);
+ vp->v_holdcnt--;
+ if (vp->v_holdcnt > 0) {
+ VI_UNLOCK(vp);
+ return;
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0) {
+ /*
+ * Mark a vnode as free: remove it from its active list
+ * and put it up for recycling on the freelist.
+ */
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vdropl: vnode already reclaimed."));
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("vnode already free"));
+ VNASSERT(VSHOULDFREE(vp), vp,
+ ("vdropl: freeing when we shouldn't"));
+ active = vp->v_iflag & VI_ACTIVE;
+ vp->v_iflag &= ~VI_ACTIVE;
+ mp = vp->v_mount;
+ mtx_lock(&vnode_free_list_mtx);
+ if (active) {
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+ v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ }
+ if (vp->v_iflag & VI_AGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ }
+ freevnodes++;
+ vp->v_iflag &= ~VI_AGE;
+ vp->v_iflag |= VI_FREE;
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ return;
+ }
+ /*
+ * The vnode has been marked for destruction, so free it.
+ */
+ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
+ mtx_lock(&vnode_free_list_mtx);
+ numvnodes--;
+ mtx_unlock(&vnode_free_list_mtx);
+ bo = &vp->v_bufobj;
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("cleaned vnode still on the free list."));
+ VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
+ VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+ VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
+ VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
+ VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
+ VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+ ("clean blk trie not empty"));
+ VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+ ("dirty blk trie not empty"));
+ VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
+ VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+ VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+ VI_UNLOCK(vp);
+#ifdef MAC
+ mac_vnode_destroy(vp);
+#endif
+ if (vp->v_pollinfo != NULL)
+ destroy_vpollinfo(vp->v_pollinfo);
+#ifdef INVARIANTS
+ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
+ vp->v_op = NULL;
+#endif
+ rangelock_destroy(&vp->v_rl);
+ lockdestroy(vp->v_vnlock);
+ mtx_destroy(&vp->v_interlock);
+ rw_destroy(BO_LOCKPTR(bo));
+ uma_zfree(vnode_zone, vp);
+}
+
+/*
+ * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
+ * flags. DOINGINACT prevents us from recursing in calls to vinactive.
+ * OWEINACT tracks whether a vnode missed a call to inactive due to a
+ * failed lock upgrade.
+ */
+void
+vinactive(struct vnode *vp, struct thread *td)
+{
+ struct vm_object *obj;
+
+ ASSERT_VOP_ELOCKED(vp, "vinactive");
+ ASSERT_VI_LOCKED(vp, "vinactive");
+ VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
+ ("vinactive: recursed on VI_DOINGINACT"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_iflag |= VI_DOINGINACT;
+ vp->v_iflag &= ~VI_OWEINACT;
+ VI_UNLOCK(vp);
+ /*
+ * Before moving off the active list, we must be sure that any
+ * modified pages are on the vnode's dirty list since these will
+ * no longer be checked once the vnode is on the inactive list.
+ * Because the vnode vm object keeps a hold reference on the vnode
+ * if there is at least one resident non-cached page, the vnode
+ * cannot leave the active list without the page cleanup done.
+ */
+ obj = vp->v_object;
+ if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ VOP_INACTIVE(vp, td);
+ VI_LOCK(vp);
+ VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
+ ("vinactive: lost VI_DOINGINACT"));
+ vp->v_iflag &= ~VI_DOINGINACT;
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If FORCECLOSE is not specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If FORCECLOSE is specified, detach any active vnodes
+ * that are found.
+ *
+ * If WRITECLOSE is set, only flush out regular file vnodes open for
+ * writing.
+ *
+ * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
+ *
+ * `rootrefs' specifies the base reference count for the root vnode
+ * of this filesystem. The root vnode is considered busy if its
+ * v_usecount exceeds this value. On a successful return, vflush()
+ * will call vrele() on the root vnode exactly rootrefs times.
+ * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
+ * be zero.
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
+#endif
+
+int
+vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
+{
+ struct vnode *vp, *mvp, *rootvp = NULL;
+ struct vattr vattr;
+ int busy = 0, error;
+
+ CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
+ rootrefs, flags);
+ if (rootrefs > 0) {
+ KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
+ ("vflush: bad args"));
+ /*
+ * Get the filesystem root vnode. We can vput() it
+ * immediately, since with rootrefs > 0, it won't go away.
+ */
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
+ CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
+ __func__, error);
+ return (error);
+ }
+ vput(rootvp);
+ }
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ vholdl(vp);
+ error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
+ if (error) {
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ /*
+ * Skip over vnodes marked VV_SYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, flush out unlinked but still open
+ * files (even if open only for reading) and regular file
+ * vnodes open for writing.
+ */
+ if (flags & WRITECLOSE) {
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ return (error);
+ }
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ VI_LOCK(vp);
+
+ if ((vp->v_type == VNON ||
+ (error == 0 && vattr.va_nlink > 0)) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ continue;
+ }
+ } else
+ VI_LOCK(vp);
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ *
+ * If FORCECLOSE is set, forcibly close the vnode.
+ */
+ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
+ VNASSERT(vp->v_usecount == 0 ||
+ (vp->v_type != VCHR && vp->v_type != VBLK), vp,
+ ("device VNODE %p is FORCECLOSED", vp));
+ vgonel(vp);
+ } else {
+ busy++;
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ }
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ }
+ if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
+ /*
+ * If just the root vnode is busy, and if its refcount
+ * is equal to `rootrefs', then go ahead and kill it.
+ */
+ VI_LOCK(rootvp);
+ KASSERT(busy > 0, ("vflush: not busy"));
+ VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
+ ("vflush: usecount %d < rootrefs %d",
+ rootvp->v_usecount, rootrefs));
+ if (busy == 1 && rootvp->v_usecount == rootrefs) {
+ VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
+ vgone(rootvp);
+ VOP_UNLOCK(rootvp, 0);
+ busy = 0;
+ } else
+ VI_UNLOCK(rootvp);
+ }
+ if (busy) {
+ CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
+ busy);
+ return (EBUSY);
+ }
+ for (; rootrefs > 0; rootrefs--)
+ vrele(rootvp);
+ return (0);
+}
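+
+/*
+ * Illustrative sketch (hypothetical filesystem): an unmount
+ * implementation typically maps MNT_FORCE onto FORCECLOSE and leaves
+ * rootrefs at zero unless it holds extra references on its root vnode:
+ *
+ *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
+ *	error = vflush(mp, 0, flags, td);
+ *	if (error != 0)
+ *		return (error);
+ */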
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ */
+int
+vrecycle(struct vnode *vp)
+{
+ int recycled;
+
+ ASSERT_VOP_ELOCKED(vp, "vrecycle");
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ recycled = 0;
+ VI_LOCK(vp);
+ if (vp->v_usecount == 0) {
+ recycled = 1;
+ vgonel(vp);
+ }
+ VI_UNLOCK(vp);
+ return (recycled);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(struct vnode *vp)
+{
+ VI_LOCK(vp);
+ vgonel(vp);
+ VI_UNLOCK(vp);
+}
+
+static void
+notify_lowervp_vfs_dummy(struct mount *mp __unused,
+ struct vnode *lowervp __unused)
+{
+}
+
+/*
+ * Notify upper mounts about reclaimed or unlinked vnode.
+ */
+void
+vfs_notify_upper(struct vnode *vp, int event)
+{
+ static struct vfsops vgonel_vfsops = {
+ .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
+ .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
+ };
+ struct mount *mp, *ump, *mmp;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+
+ MNT_ILOCK(mp);
+ if (TAILQ_EMPTY(&mp->mnt_uppers))
+ goto unlock;
+ MNT_IUNLOCK(mp);
+ mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
+ mmp->mnt_op = &vgonel_vfsops;
+ mmp->mnt_kern_flag |= MNTK_MARKER;
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
+ for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
+ if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
+ ump = TAILQ_NEXT(ump, mnt_upper_link);
+ continue;
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
+ MNT_IUNLOCK(mp);
+ switch (event) {
+ case VFS_NOTIFY_UPPER_RECLAIM:
+ VFS_RECLAIM_LOWERVP(ump, vp);
+ break;
+ case VFS_NOTIFY_UPPER_UNLINK:
+ VFS_UNLINK_LOWERVP(ump, vp);
+ break;
+ default:
+ KASSERT(0, ("invalid event %d", event));
+ break;
+ }
+ MNT_ILOCK(mp);
+ ump = TAILQ_NEXT(mmp, mnt_upper_link);
+ TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
+ }
+ free(mmp, M_TEMP);
+ mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
+ if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
+ mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
+ wakeup(&mp->mnt_uppers);
+ }
+unlock:
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+void
+vgonel(struct vnode *vp)
+{
+ struct thread *td;
+ int oweinact;
+ int active;
+ struct mount *mp;
+
+ ASSERT_VOP_ELOCKED(vp, "vgonel");
+ ASSERT_VI_LOCKED(vp, "vgonel");
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vgonel: vp %p has no reference.", vp));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ td = curthread;
+
+ /*
+ * Don't vgonel if we're already doomed.
+ */
+ if (vp->v_iflag & VI_DOOMED)
+ return;
+ vp->v_iflag |= VI_DOOMED;
+
+ /*
+ * Check to see if the vnode is in use. If so, we have to call
+ * VOP_CLOSE() and VOP_INACTIVE().
+ */
+ active = vp->v_usecount;
+ oweinact = (vp->v_iflag & VI_OWEINACT);
+ VI_UNLOCK(vp);
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ * If the flush fails, just toss the buffers.
+ */
+ mp = NULL;
+ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
+ (void) vn_start_secondary_write(vp, &mp, V_WAIT);
+ if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
+ vinvalbuf(vp, 0, 0, 0);
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed.
+ */
+ if (active)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
+ if (oweinact || active) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOINGINACT) == 0)
+ vinactive(vp, td);
+ VI_UNLOCK(vp);
+ }
+ if (vp->v_type == VSOCK)
+ vfs_unp_reclaim(vp);
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, td))
+ panic("vgone: cannot reclaim");
+ if (mp != NULL)
+ vn_finished_secondary_write(mp);
+ VNASSERT(vp->v_object == NULL, vp,
+ ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
+ /*
+ * Clear the advisory locks and wake up waiting threads.
+ */
+ (void)VOP_ADVLOCKPURGE(vp);
+ /*
+ * Delete from old mount point vnode list.
+ */
+ delmntque(vp);
+ cache_purge(vp);
+ /*
+ * Done with purge, reset to the standard lock and invalidate
+ * the vnode.
+ */
+ VI_LOCK(vp);
+ vp->v_vnlock = &vp->v_lock;
+ vp->v_op = &dead_vnodeops;
+ vp->v_tag = "none";
+ vp->v_type = VBAD;
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(struct vnode *vp)
+{
+ int count;
+
+ dev_lock();
+ count = vp->v_rdev->si_usecount;
+ dev_unlock();
+ return (count);
+}
+
+/*
+ * Same as above, but using the struct cdev *as argument
+ */
+int
+count_dev(struct cdev *dev)
+{
+ int count;
+
+ dev_lock();
+ count = dev->si_usecount;
+ dev_unlock();
+ return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
+ "VMARKER"};
+
+void
+vn_printf(struct vnode *vp, const char *fmt, ...)
+{
+ va_list ap;
+ char buf[256], buf2[16];
+ u_long flags;
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf("%p: ", (void *)vp);
+ printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
+ printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
+ vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
+ buf[0] = '\0';
+ buf[1] = '\0';
+ if (vp->v_vflag & VV_ROOT)
+ strlcat(buf, "|VV_ROOT", sizeof(buf));
+ if (vp->v_vflag & VV_ISTTY)
+ strlcat(buf, "|VV_ISTTY", sizeof(buf));
+ if (vp->v_vflag & VV_NOSYNC)
+ strlcat(buf, "|VV_NOSYNC", sizeof(buf));
+ if (vp->v_vflag & VV_ETERNALDEV)
+ strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
+ if (vp->v_vflag & VV_CACHEDLABEL)
+ strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
+ if (vp->v_vflag & VV_TEXT)
+ strlcat(buf, "|VV_TEXT", sizeof(buf));
+ if (vp->v_vflag & VV_COPYONWRITE)
+ strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
+ if (vp->v_vflag & VV_SYSTEM)
+ strlcat(buf, "|VV_SYSTEM", sizeof(buf));
+ if (vp->v_vflag & VV_PROCDEP)
+ strlcat(buf, "|VV_PROCDEP", sizeof(buf));
+ if (vp->v_vflag & VV_NOKNOTE)
+ strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
+ if (vp->v_vflag & VV_DELETED)
+ strlcat(buf, "|VV_DELETED", sizeof(buf));
+ if (vp->v_vflag & VV_MD)
+ strlcat(buf, "|VV_MD", sizeof(buf));
+ if (vp->v_vflag & VV_FORCEINSMQ)
+ strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
+ flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
+ VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+ VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ if (vp->v_iflag & VI_MOUNT)
+ strlcat(buf, "|VI_MOUNT", sizeof(buf));
+ if (vp->v_iflag & VI_AGE)
+ strlcat(buf, "|VI_AGE", sizeof(buf));
+ if (vp->v_iflag & VI_DOOMED)
+ strlcat(buf, "|VI_DOOMED", sizeof(buf));
+ if (vp->v_iflag & VI_FREE)
+ strlcat(buf, "|VI_FREE", sizeof(buf));
+ if (vp->v_iflag & VI_ACTIVE)
+ strlcat(buf, "|VI_ACTIVE", sizeof(buf));
+ if (vp->v_iflag & VI_DOINGINACT)
+ strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
+ if (vp->v_iflag & VI_OWEINACT)
+ strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+ flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+ VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ printf(" flags (%s)\n", buf + 1);
+ if (mtx_owned(VI_MTX(vp)))
+ printf(" VI_LOCKed");
+ if (vp->v_object != NULL)
+ printf(" v_object %p ref %d pages %d\n",
+ vp->v_object, vp->v_object->ref_count,
+ vp->v_object->resident_page_count);
+ printf(" ");
+ lockmgr_printinfo(vp->v_vnlock);
+ if (vp->v_data != NULL)
+ VOP_PRINT(vp);
+}
+
+#ifdef DDB
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
+{
+ struct mount *mp;
+ struct vnode *vp;
+
+ /*
+ * Note: because this is DDB, we can't obey the locking semantics
+ * for these structures, which means we could catch an inconsistent
+ * state and dereference a nasty pointer. Not much to be done
+ * about that.
+ */
+ db_printf("Locked vnodes\n");
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
+ vprint("", vp);
+ }
+ }
+}
+
+/*
+ * Show details about the given vnode.
+ */
+DB_SHOW_COMMAND(vnode, db_show_vnode)
+{
+ struct vnode *vp;
+
+ if (!have_addr)
+ return;
+ vp = (struct vnode *)addr;
+ vn_printf(vp, "vnode ");
+}
+
+/*
+ * Show details about the given mount point.
+ */
+DB_SHOW_COMMAND(mount, db_show_mount)
+{
+ struct mount *mp;
+ struct vfsopt *opt;
+ struct statfs *sp;
+ struct vnode *vp;
+ char buf[512];
+ uint64_t mflags;
+ u_int flags;
+
+ if (!have_addr) {
+ /* No address given, print short info about all mount points. */
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ db_printf("%p %s on %s (%s)\n", mp,
+ mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname,
+ mp->mnt_stat.f_fstypename);
+ if (db_pager_quit)
+ break;
+ }
+ db_printf("\nMore info: show mount <addr>\n");
+ return;
+ }
+
+ mp = (struct mount *)addr;
+ db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
+
+ buf[0] = '\0';
+ mflags = mp->mnt_flag;
+#define MNT_FLAG(flag) do { \
+ if (mflags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 4, sizeof(buf)); \
+ mflags &= ~(flag); \
+ } \
+} while (0)
+ MNT_FLAG(MNT_RDONLY);
+ MNT_FLAG(MNT_SYNCHRONOUS);
+ MNT_FLAG(MNT_NOEXEC);
+ MNT_FLAG(MNT_NOSUID);
+ MNT_FLAG(MNT_NFS4ACLS);
+ MNT_FLAG(MNT_UNION);
+ MNT_FLAG(MNT_ASYNC);
+ MNT_FLAG(MNT_SUIDDIR);
+ MNT_FLAG(MNT_SOFTDEP);
+ MNT_FLAG(MNT_NOSYMFOLLOW);
+ MNT_FLAG(MNT_GJOURNAL);
+ MNT_FLAG(MNT_MULTILABEL);
+ MNT_FLAG(MNT_ACLS);
+ MNT_FLAG(MNT_NOATIME);
+ MNT_FLAG(MNT_NOCLUSTERR);
+ MNT_FLAG(MNT_NOCLUSTERW);
+ MNT_FLAG(MNT_SUJ);
+ MNT_FLAG(MNT_EXRDONLY);
+ MNT_FLAG(MNT_EXPORTED);
+ MNT_FLAG(MNT_DEFEXPORTED);
+ MNT_FLAG(MNT_EXPORTANON);
+ MNT_FLAG(MNT_EXKERB);
+ MNT_FLAG(MNT_EXPUBLIC);
+ MNT_FLAG(MNT_LOCAL);
+ MNT_FLAG(MNT_QUOTA);
+ MNT_FLAG(MNT_ROOTFS);
+ MNT_FLAG(MNT_USER);
+ MNT_FLAG(MNT_IGNORE);
+ MNT_FLAG(MNT_UPDATE);
+ MNT_FLAG(MNT_DELEXPORT);
+ MNT_FLAG(MNT_RELOAD);
+ MNT_FLAG(MNT_FORCE);
+ MNT_FLAG(MNT_SNAPSHOT);
+ MNT_FLAG(MNT_BYFSID);
+#undef MNT_FLAG
+ if (mflags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%016jx", mflags);
+ }
+ db_printf(" mnt_flag = %s\n", buf);
+
+ buf[0] = '\0';
+ flags = mp->mnt_kern_flag;
+#define MNT_KERN_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 5, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ MNT_KERN_FLAG(MNTK_UNMOUNTF);
+ MNT_KERN_FLAG(MNTK_ASYNC);
+ MNT_KERN_FLAG(MNTK_SOFTDEP);
+ MNT_KERN_FLAG(MNTK_NOINSMNTQ);
+ MNT_KERN_FLAG(MNTK_DRAINING);
+ MNT_KERN_FLAG(MNTK_REFEXPIRE);
+ MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
+ MNT_KERN_FLAG(MNTK_SHARED_WRITES);
+ MNT_KERN_FLAG(MNTK_NO_IOPF);
+ MNT_KERN_FLAG(MNTK_VGONE_UPPER);
+ MNT_KERN_FLAG(MNTK_VGONE_WAITER);
+ MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
+ MNT_KERN_FLAG(MNTK_MARKER);
+ MNT_KERN_FLAG(MNTK_NOASYNC);
+ MNT_KERN_FLAG(MNTK_UNMOUNT);
+ MNT_KERN_FLAG(MNTK_MWAIT);
+ MNT_KERN_FLAG(MNTK_SUSPEND);
+ MNT_KERN_FLAG(MNTK_SUSPEND2);
+ MNT_KERN_FLAG(MNTK_SUSPENDED);
+ MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
+ MNT_KERN_FLAG(MNTK_NOKNOTE);
+#undef MNT_KERN_FLAG
+ if (flags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%08x", flags);
+ }
+ db_printf(" mnt_kern_flag = %s\n", buf);
+
+ db_printf(" mnt_opt = ");
+ opt = TAILQ_FIRST(mp->mnt_opt);
+ if (opt != NULL) {
+ db_printf("%s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ while (opt != NULL) {
+ db_printf(", %s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ }
+ }
+ db_printf("\n");
+
+ sp = &mp->mnt_stat;
+ db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
+ "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
+ "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
+ "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
+ (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
+ (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
+ (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
+ (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
+ (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
+ (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
+ (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
+ (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
+
+ db_printf(" mnt_cred = { uid=%u ruid=%u",
+ (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
+ if (jailed(mp->mnt_cred))
+ db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
+ db_printf(" }\n");
+ db_printf(" mnt_ref = %d\n", mp->mnt_ref);
+ db_printf(" mnt_gen = %d\n", mp->mnt_gen);
+ db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
+ db_printf(" mnt_activevnodelistsize = %d\n",
+ mp->mnt_activevnodelistsize);
+ db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
+ db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
+ db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
+ db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
+ db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
+ db_printf(" mnt_secondary_accwrites = %d\n",
+ mp->mnt_secondary_accwrites);
+ db_printf(" mnt_gjprovider = %s\n",
+ mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
+
+ db_printf("\n\nList of active vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
+ if (vp->v_type != VMARKER) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+ db_printf("\n\nList of inactive vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Fill in a struct xvfsconf based on a struct vfsconf.
+ */
+static int
+vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf xvfsp;
+
+ bzero(&xvfsp, sizeof(xvfsp));
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ /*
+ * These are unused in userland; we keep them
+ * to preserve binary compatibility.
+ */
+ xvfsp.vfc_vfsops = NULL;
+ xvfsp.vfc_next = NULL;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+
+#ifdef COMPAT_FREEBSD32
+struct xvfsconf32 {
+ uint32_t vfc_vfsops;
+ char vfc_name[MFSNAMELEN];
+ int32_t vfc_typenum;
+ int32_t vfc_refcount;
+ int32_t vfc_flags;
+ uint32_t vfc_next;
+};
+
+static int
+vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf32 xvfsp;
+
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ xvfsp.vfc_vfsops = 0;
+ xvfsp.vfc_next = 0;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int
+sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsconf *vfsp;
+ int error;
+
+ error = 0;
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ error = vfsconf2x32(req, vfsp);
+ else
+#endif
+ error = vfsconf2x(req, vfsp);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_vfs_conflist,
+ "S,xvfsconf", "List of all configured filesystems");
+
+#ifndef BURN_BRIDGES
+static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+ log(LOG_WARNING, "userland calling deprecated sysctl, "
+ "please rebuild world\n");
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ return (vfsconf2x32(req, vfsp));
+ else
+#endif
+ return (vfsconf2x(req, vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
+ vfs_sysctl, "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+ bzero(&ovfs, sizeof(ovfs));
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+#endif /* !BURN_BRIDGES */
+
+#define KINFO_VNODESLOP 10
+#ifdef notyet
+/*
+ * Dump vnode list (via sysctl).
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct xvnode *xvn;
+ struct mount *mp;
+ struct vnode *vp;
+ int error, len, n;
+
+ /*
+ * Stale numvnodes access is not fatal here.
+ */
+ req->lock = 0;
+ len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
+ if (!req->oldptr)
+ /* Make an estimate */
+ return (SYSCTL_OUT(req, 0, len));
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
+ n = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
+ continue;
+ MNT_ILOCK(mp);
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (n == len)
+ break;
+ vref(vp);
+ xvn[n].xv_size = sizeof *xvn;
+ xvn[n].xv_vnode = vp;
+ xvn[n].xv_id = 0; /* XXX compat */
+#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
+ XV_COPY(usecount);
+ XV_COPY(writecount);
+ XV_COPY(holdcnt);
+ XV_COPY(mount);
+ XV_COPY(numoutput);
+ XV_COPY(type);
+#undef XV_COPY
+ xvn[n].xv_flag = vp->v_vflag;
+
+ switch (vp->v_type) {
+ case VREG:
+ case VDIR:
+ case VLNK:
+ break;
+ case VBLK:
+ case VCHR:
+ if (vp->v_rdev == NULL) {
+ vrele(vp);
+ continue;
+ }
+ xvn[n].xv_dev = dev2udev(vp->v_rdev);
+ break;
+ case VSOCK:
+ xvn[n].xv_socket = vp->v_socket;
+ break;
+ case VFIFO:
+ xvn[n].xv_fifo = vp->v_fifoinfo;
+ break;
+ case VNON:
+ case VBAD:
+ default:
+ /* shouldn't happen? */
+ vrele(vp);
+ continue;
+ }
+ vrele(vp);
+ ++n;
+ }
+ MNT_IUNLOCK(mp);
+ mtx_lock(&mountlist_mtx);
+ vfs_unbusy(mp);
+ if (n == len)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+
+ error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
+ free(xvn, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,xvnode", "");
+#endif
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall(void)
+{
+ struct mount *mp;
+ struct thread *td;
+ int error;
+
+ CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
+ td = curthread;
+
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ while (!TAILQ_EMPTY(&mountlist)) {
+ mp = TAILQ_LAST(&mountlist, mntlist);
+ error = dounmount(mp, MNT_FORCE, td);
+ if (error) {
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ /*
+ * XXX: Due to the way in which we mount the root
+ * file system off of devfs, devfs will generate a
+ * "busy" warning when we try to unmount it before
+ * the root. Don't print a warning as a result in
+ * order to avoid false positive errors that may
+ * cause needless upset.
+ */
+ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+ } else {
+ /* The unmount has removed mp from the mountlist */
+ }
+ }
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *mvp;
+ struct vm_object *obj;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+ obj = vp->v_object;
+ if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+ (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
+ if (!vget(vp,
+ LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
+ curthread)) {
+ if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
+ vput(vp);
+ continue;
+ }
+
+ obj = vp->v_object;
+ if (obj != NULL) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0,
+ flags == MNT_WAIT ?
+ OBJPC_SYNC : OBJPC_NOSYNC);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ vput(vp);
+ }
+ } else
+ VI_UNLOCK(vp);
+ }
+}
+
+static void
+destroy_vpollinfo_free(struct vpollinfo *vi)
+{
+
+ knlist_destroy(&vi->vpi_selinfo.si_note);
+ mtx_destroy(&vi->vpi_lock);
+ uma_zfree(vnodepoll_zone, vi);
+}
+
+static void
+destroy_vpollinfo(struct vpollinfo *vi)
+{
+
+ knlist_clear(&vi->vpi_selinfo.si_note, 1);
+ seldrain(&vi->vpi_selinfo);
+ destroy_vpollinfo_free(vi);
+}
+
+/*
+ * Initialize a per-vnode helper structure to hold poll-related state.
+ */
+void
+v_addpollinfo(struct vnode *vp)
+{
+ struct vpollinfo *vi;
+
+ if (vp->v_pollinfo != NULL)
+ return;
+ vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
+ mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
+ knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
+ vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
+ VI_LOCK(vp);
+ if (vp->v_pollinfo != NULL) {
+ VI_UNLOCK(vp);
+ destroy_vpollinfo_free(vi);
+ return;
+ }
+ vp->v_pollinfo = vi;
+ VI_UNLOCK(vp);
+}
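+
+/*
+ * Note the unlocked pre-check followed by the re-check under the
+ * interlock above: two racing callers may both allocate a vpollinfo,
+ * but the loser frees its copy with destroy_vpollinfo_free(), so
+ * vp->v_pollinfo is assigned exactly once.
+ */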
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(struct vnode *vp, struct thread *td, int events)
+{
+
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo->vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+ * in available for the other process which
+ * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo->vpi_revents;
+ vp->v_pollinfo->vpi_revents &= ~events;
+
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (events);
+ }
+ vp->v_pollinfo->vpi_events |= events;
+ selrecord(td, &vp->v_pollinfo->vpi_selinfo);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (0);
+}
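+
+/*
+ * Illustrative sketch (hypothetical VOP_POLL method): events that
+ * cannot be answered immediately are recorded here so the caller is
+ * woken when they occur; handled_events below is a placeholder mask:
+ *
+ *	if ((ap->a_events & handled_events) == 0)
+ *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ */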
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*)(struct vop_close_args *))nullop)
+static int sync_fsync(struct vop_fsync_args *);
+static int sync_inactive(struct vop_inactive_args *);
+static int sync_reclaim(struct vop_reclaim_args *);
+
+static struct vop_vector sync_vnodeops = {
+ .vop_bypass = VOP_EOPNOTSUPP,
+ .vop_close = sync_close, /* close */
+ .vop_fsync = sync_fsync, /* fsync */
+ .vop_inactive = sync_inactive, /* inactive */
+ .vop_reclaim = sync_reclaim, /* reclaim */
+ .vop_lock1 = vop_stdlock, /* lock */
+ .vop_unlock = vop_stdunlock, /* unlock */
+ .vop_islocked = vop_stdislocked, /* islocked */
+};
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+void
+vfs_allocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: getnewvnode() failed");
+ vp->v_type = VNON;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ error = insmntque(vp, mp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: insmntque() failed");
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ VOP_UNLOCK(vp, 0);
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
+ /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
+ mtx_lock(&sync_mtx);
+ sync_vnode_count++;
+ if (mp->mnt_syncer == NULL) {
+ mp->mnt_syncer = vp;
+ vp = NULL;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+ if (vp != NULL) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vgone(vp);
+ vput(vp);
+ }
+}
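+
+/*
+ * Worked example of the scattering above, assuming syncer_maxdelay is
+ * 32: successive calls yield next = 16, 8, 24, 4, 12, 20, 28, 2, ...
+ * Each pass halves the spacing, so syncer vnodes land in evenly
+ * distributed worklist slots even when many filesystems are mounted at
+ * the same time.
+ */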
+
+void
+vfs_deallocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+
+ mtx_lock(&sync_mtx);
+ vp = mp->mnt_syncer;
+ if (vp != NULL)
+ mp->mnt_syncer = NULL;
+ mtx_unlock(&sync_mtx);
+ if (vp != NULL)
+ vrele(vp);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(struct vop_fsync_args *ap)
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ int error, save;
+ struct bufobj *bo;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ bo = &syncvp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay);
+ BO_UNLOCK(bo);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ mtx_lock(&mountlist_mtx);
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+ }
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
+ vfs_unbusy(mp);
+ return (0);
+ }
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ error = VFS_SYNC(mp, MNT_LAZY);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(struct vop_inactive_args *ap)
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ *
+ * Modifications to the worklist must be protected by sync_mtx.
+ */
+static int
+sync_reclaim(struct vop_reclaim_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ mtx_lock(&sync_mtx);
+ if (vp->v_mount->mnt_syncer == vp)
+ vp->v_mount->mnt_syncer = NULL;
+ if (bo->bo_flag & BO_ONWORKLST) {
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ sync_vnode_count--;
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+
+ return (0);
+}
+
+/*
+ * Check if vnode represents a disk device
+ */
+int
+vn_isdisk(struct vnode *vp, int *errp)
+{
+ int error;
+
+ error = 0;
+ dev_lock();
+ if (vp->v_type != VCHR)
+ error = ENOTBLK;
+ else if (vp->v_rdev == NULL)
+ error = ENXIO;
+ else if (vp->v_rdev->si_devsw == NULL)
+ error = ENXIO;
+ else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
+ error = ENOTBLK;
+ dev_unlock();
+ if (errp != NULL)
+ *errp = error;
+ return (error == 0);
+}
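+
+/*
+ * Illustrative sketch: mount code for a disk-backed filesystem commonly
+ * rejects anything that is not a disk device vnode up front:
+ *
+ *	if (!vn_isdisk(devvp, &error)) {
+ *		vput(devvp);
+ *		return (error);
+ *	}
+ */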
+
+/*
+ * Common filesystem object access control check routine. Accepts a
+ * vnode's type, "mode", uid and gid, requested access mode, credentials,
+ * and optional call-by-reference privused argument allowing vaccess()
+ * to indicate to the caller whether privilege was used to satisfy the
+ * request (obsoleted). Returns 0 on success, or an errno on failure.
+ */
+int
+vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ accmode_t accmode, struct ucred *cred, int *privused)
+{
+ accmode_t dac_granted;
+ accmode_t priv_granted;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that.
+ */
+
+ if (privused != NULL)
+ *privused = 0;
+
+ dac_granted = 0;
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ dac_granted |= VADMIN;
+ if (file_mode & S_IXUSR)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRUSR)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWUSR)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRGRP)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWGRP)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IROTH)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWOTH)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+privcheck:
+ /*
+ * Build a privilege mask to determine if the set of privileges
+ * satisfies the requirements when combined with the granted mask
+ * from above. For each privilege, if the privilege is required,
+ * bitwise or the request type onto the priv_granted mask.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ /*
+ * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
+ * requests, instead of PRIV_VFS_EXEC.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ if ((accmode & (priv_granted | dac_granted)) == accmode) {
+ /* XXX audit: privilege used */
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+ return ((accmode & VADMIN) ? EPERM : EACCES);
+}
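+
+/*
+ * Illustrative sketch (hypothetical filesystem): a VOP_ACCESS method
+ * normally ends by handing its on-disk ownership and mode bits to
+ * vaccess(); the i_* field names below are placeholders:
+ *
+ *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
+ *	    ap->a_accmode, ap->a_cred, NULL));
+ */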
+
+/*
+ * Credential check based on process requesting service, and per-attribute
+ * permissions.
+ */
+int
+extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
+ struct thread *td, accmode_t accmode)
+{
+
+ /*
+ * Kernel-invoked requests always succeed.
+ */
+ if (cred == NOCRED)
+ return (0);
+
+ /*
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
+ */
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ /* Potentially should be: return (EPERM); */
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
+ case EXTATTR_NAMESPACE_USER:
+ return (VOP_ACCESS(vp, accmode, cred, td));
+ default:
+ return (EPERM);
+ }
+}
+
+#ifdef DEBUG_VFS_LOCKS
+/*
+ * This only exists to suppress warnings from unlocked specfs accesses. It is
+ * no longer ok to have an unlocked VFS.
+ */
+#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
+ (vp)->v_type == VCHR || (vp)->v_type == VBAD)
+
+int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
+ "Drop into debugger on lock violation");
+
+int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
+ 0, "Check for interlock across VOPs");
+
+int vfs_badlock_print = 1; /* Print lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
+ 0, "Print lock violations");
+
+#ifdef KDB
+int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
+ &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
+#endif
+
+static void
+vfs_badlock(const char *msg, const char *str, struct vnode *vp)
+{
+
+#ifdef KDB
+ if (vfs_badlock_backtrace)
+ kdb_backtrace();
+#endif
+ if (vfs_badlock_print)
+ printf("%s: %p %s\n", str, (void *)vp, msg);
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+}
+
+void
+assert_vi_locked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is not locked but should be", str, vp);
+}
+
+void
+assert_vi_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is locked but should not be", str, vp);
+}
+
+void
+assert_vop_locked(struct vnode *vp, const char *str)
+{
+ int locked;
+
+ if (!IGNORE_LOCK(vp)) {
+ locked = VOP_ISLOCKED(vp);
+ if (locked == 0 || locked == LK_EXCLOTHER)
+ vfs_badlock("is not locked but should be", str, vp);
+ }
+}
+
+void
+assert_vop_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+ vfs_badlock("is locked but should not be", str, vp);
+}
+
+void
+assert_vop_elocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ vfs_badlock("is not exclusive locked but should be", str, vp);
+}
+
+#if 0
+void
+assert_vop_elocked_other(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
+ vfs_badlock("is not exclusive locked by another thread",
+ str, vp);
+}
+
+void
+assert_vop_slocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
+ vfs_badlock("is not locked shared but should be", str, vp);
+}
+#endif /* 0 */
+#endif /* DEBUG_VFS_LOCKS */
+
+void
+vop_rename_fail(struct vop_rename_args *ap)
+{
+
+ if (ap->a_tvp != NULL)
+ vput(ap->a_tvp);
+ if (ap->a_tdvp == ap->a_tvp)
+ vrele(ap->a_tdvp);
+ else
+ vput(ap->a_tdvp);
+ vrele(ap->a_fdvp);
+ vrele(ap->a_fvp);
+}
+
+void
+vop_rename_pre(void *ap)
+{
+ struct vop_rename_args *a = ap;
+
+#ifdef DEBUG_VFS_LOCKS
+ if (a->a_tvp)
+ ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
+
+ /* Check the source (from). */
+ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
+ (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
+ ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
+ if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
+ ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
+
+ /* Check the target. */
+ if (a->a_tvp)
+ ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
+ ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+#endif
+ if (a->a_tdvp != a->a_fdvp)
+ vhold(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vhold(a->a_fvp);
+ vhold(a->a_tdvp);
+ if (a->a_tvp)
+ vhold(a->a_tvp);
+}
+
+void
+vop_strategy_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_strategy_args *a;
+ struct buf *bp;
+
+ a = ap;
+ bp = a->a_bp;
+
+ /*
+ * Cluster ops lock their component buffers but not the IO container.
+ */
+ if ((bp->b_flags & B_CLUSTER) != 0)
+ return;
+
+ if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
+ if (vfs_badlock_print)
+ printf(
+ "VOP_STRATEGY: bp is not locked but should be\n");
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+ }
+#endif
+}
+
+void
+vop_lock_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_lock1_args *a = ap;
+
+ if ((a->a_flags & LK_INTERLOCK) == 0)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ else
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
+#endif
+}
+
+void
+vop_lock_post(void *ap, int rc)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_lock1_args *a = ap;
+
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
+#endif
+}
+
+void
+vop_unlock_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
+#endif
+}
+
+void
+vop_unlock_post(void *ap, int rc)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
+#endif
+}
+
+void
+vop_create_post(void *ap, int rc)
+{
+ struct vop_create_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_deleteextattr_post(void *ap, int rc)
+{
+ struct vop_deleteextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_link_post(void *ap, int rc)
+{
+ struct vop_link_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
+ }
+}
+
+void
+vop_mkdir_post(void *ap, int rc)
+{
+ struct vop_mkdir_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+}
+
+void
+vop_mknod_post(void *ap, int rc)
+{
+ struct vop_mknod_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_remove_post(void *ap, int rc)
+{
+ struct vop_remove_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_rename_post(void *ap, int rc)
+{
+ struct vop_rename_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
+ VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
+ if (a->a_tvp)
+ VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
+ }
+ if (a->a_tdvp != a->a_fdvp)
+ vdrop(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vdrop(a->a_fvp);
+ vdrop(a->a_tdvp);
+ if (a->a_tvp)
+ vdrop(a->a_tvp);
+}
+
+void
+vop_rmdir_post(void *ap, int rc)
+{
+ struct vop_rmdir_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_setattr_post(void *ap, int rc)
+{
+ struct vop_setattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_setextattr_post(void *ap, int rc)
+{
+ struct vop_setextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_symlink_post(void *ap, int rc)
+{
+ struct vop_symlink_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+static struct knlist fs_knlist;
+
+static void
+vfs_event_init(void *arg)
+{
+ knlist_init_mtx(&fs_knlist, NULL);
+}
+/* XXX - correct order? */
+SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
+
+void
+vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
+{
+
+ KNOTE_UNLOCKED(&fs_knlist, event);
+}
+
+static int filt_fsattach(struct knote *kn);
+static void filt_fsdetach(struct knote *kn);
+static int filt_fsevent(struct knote *kn, long hint);
+
+struct filterops fs_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_fsattach,
+ .f_detach = filt_fsdetach,
+ .f_event = filt_fsevent
+};
+
+static int
+filt_fsattach(struct knote *kn)
+{
+
+ kn->kn_flags |= EV_CLEAR;
+ knlist_add(&fs_knlist, kn, 0);
+ return (0);
+}
+
+static void
+filt_fsdetach(struct knote *kn)
+{
+
+ knlist_remove(&fs_knlist, kn, 0);
+}
+
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+
+ kn->kn_fflags |= hint;
+ return (kn->kn_fflags != 0);
+}
+
+static int
+sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsidctl vc;
+ int error;
+ struct mount *mp;
+
+ error = SYSCTL_IN(req, &vc, sizeof(vc));
+ if (error)
+ return (error);
+ if (vc.vc_vers != VFS_CTL_VERS1)
+ return (EINVAL);
+ mp = vfs_getvfs(&vc.vc_fsid);
+ if (mp == NULL)
+ return (ENOENT);
+ /* ensure that a specific sysctl goes to the right filesystem. */
+ if (strcmp(vc.vc_fstypename, "*") != 0 &&
+ strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
+ vfs_rel(mp);
+ return (EINVAL);
+ }
+ VCTLTOREQ(&vc, req);
+ error = VFS_SYSCTL(mp, vc.vc_op, req);
+ vfs_rel(mp);
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
+ NULL, 0, sysctl_vfs_ctl, "",
+ "Sysctl by fsid");
+
+/*
+ * Function to initialize a va_filerev field sensibly.
+ * XXX: Wouldn't a random number make a lot more sense ??
+ */
+u_quad_t
+init_va_filerev(void)
+{
+ struct bintime bt;
+
+ getbinuptime(&bt);
+ return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
+}
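+
+/*
+ * The packing above keeps va_filerev monotonic: the uptime seconds fill
+ * the upper 32 bits and the most significant 32 bits of the binary
+ * fraction fill the lower half, so calls in the same second will
+ * normally still produce distinct values.
+ */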
+
+static int filt_vfsread(struct knote *kn, long hint);
+static int filt_vfswrite(struct knote *kn, long hint);
+static int filt_vfsvnode(struct knote *kn, long hint);
+static void filt_vfsdetach(struct knote *kn);
+static struct filterops vfsread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsread
+};
+static struct filterops vfswrite_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfswrite
+};
+static struct filterops vfsvnode_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsvnode
+};
+
+static void
+vfs_knllock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+}
+
+static void
+vfs_knlunlock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ VOP_UNLOCK(vp, 0);
+}
+
+static void
+vfs_knl_assert_locked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
+#endif
+}
+
+static void
+vfs_knl_assert_unlocked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
+#endif
+}
+
+int
+vfs_kqfilter(struct vop_kqfilter_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct knote *kn = ap->a_kn;
+ struct knlist *knl;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &vfsread_filtops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &vfswrite_filtops;
+ break;
+ case EVFILT_VNODE:
+ kn->kn_fop = &vfsvnode_filtops;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ kn->kn_hook = (caddr_t)vp;
+
+ v_addpollinfo(vp);
+ if (vp->v_pollinfo == NULL)
+ return (ENOMEM);
+ knl = &vp->v_pollinfo->vpi_selinfo.si_note;
+ knlist_add(knl, kn, 0);
+
+ return (0);
+}
+
+/*
+ * Detach knote from vnode
+ */
+static void
+filt_vfsdetach(struct knote *kn)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfsread(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ struct vattr va;
+ int res;
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE) {
+ VI_LOCK(vp);
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ VI_UNLOCK(vp);
+ return (1);
+ }
+
+ if (VOP_GETATTR(vp, &va, curthread->td_ucred))
+ return (0);
+
+ VI_LOCK(vp);
+ kn->kn_data = va.va_size - kn->kn_fp->f_offset;
+ res = (kn->kn_data != 0);
+ VI_UNLOCK(vp);
+ return (res);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfswrite(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ VI_LOCK(vp);
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE)
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+
+ kn->kn_data = 0;
+ VI_UNLOCK(vp);
+ return (1);
+}
+
+static int
+filt_vfsvnode(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ int res;
+
+ VI_LOCK(vp);
+ if (kn->kn_sfflags & hint)
+ kn->kn_fflags |= hint;
+ if (hint == NOTE_REVOKE) {
+ kn->kn_flags |= EV_EOF;
+ VI_UNLOCK(vp);
+ return (1);
+ }
+ res = (kn->kn_fflags != 0);
+ VI_UNLOCK(vp);
+ return (res);
+}
+
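+/*
+ * Helper for VOP_READDIR() implementations: copy one struct dirent to
+ * the caller's uio and, if cookies were requested, append the entry's
+ * cookie (its offset) to the cookie array.
+ */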
+int
+vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
+{
+ int error;
+
+ if (dp->d_reclen > ap->a_uio->uio_resid)
+ return (ENAMETOOLONG);
+ error = uiomove(dp, dp->d_reclen, ap->a_uio);
+ if (error) {
+ if (ap->a_ncookies != NULL) {
+ if (ap->a_cookies != NULL)
+ free(ap->a_cookies, M_TEMP);
+ ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+ return (error);
+ }
+ if (ap->a_ncookies == NULL)
+ return (0);
+
+ KASSERT(ap->a_cookies,
+ ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
+
+ *ap->a_cookies = realloc(*ap->a_cookies,
+ (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
+ (*ap->a_cookies)[*ap->a_ncookies] = off;
+ return (0);
+}
+
+/*
+ * Mark the access time of the file for update if the filesystem
+ * supports VOP_MARKATIME. This functionality is used by execve and
+ * mmap, so we want to avoid the I/O implied by directly setting
+ * va_atime for the sake of efficiency.
+ */
+void
+vfs_mark_atime(struct vnode *vp, struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = vp->v_mount;
+ ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
+ if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
+ (void)VOP_MARKATIME(vp);
+}
+
+/*
+ * The purpose of this routine is to remove granularity from accmode_t,
+ * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
+ * VADMIN and VAPPEND.
+ *
+ * If it returns 0, the caller is supposed to continue with the usual
+ * access checks using 'accmode' as modified by this routine. If it
+ * returns a nonzero value, the caller is supposed to return that value
+ * as errno.
+ *
+ * Note that after this routine runs, accmode may be zero.
+ */
+int
+vfs_unixify_accmode(accmode_t *accmode)
+{
+ /*
+ * There is no way to specify explicit "deny" rule using
+ * file mode or POSIX.1e ACLs.
+ */
+ if (*accmode & VEXPLICIT_DENY) {
+ *accmode = 0;
+ return (0);
+ }
+
+ /*
+ * None of these can be translated into usual access bits.
+ * Also, the common case for NFSv4 ACLs is to not contain
+ * either of these bits. Caller should check for VWRITE
+ * on the containing directory instead.
+ */
+ if (*accmode & (VDELETE_CHILD | VDELETE))
+ return (EPERM);
+
+ if (*accmode & VADMIN_PERMS) {
+ *accmode &= ~VADMIN_PERMS;
+ *accmode |= VADMIN;
+ }
+
+ /*
+ * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
+ * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
+ */
+ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
+
+ return (0);
+}
+
+/*
+ * These are helper functions for filesystems to traverse all
+ * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
+ *
+ * This interface replaces MNT_VNODE_FOREACH.
+ */
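+/*
+ * Typical use, roughly (see MNT_VNODE_FOREACH_ALL() in sys/mount.h for
+ * the canonical definition):
+ *
+ *     struct vnode *vp, *mvp;
+ *
+ *     MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ *             ... vp is returned with its interlock held ...
+ *             VI_UNLOCK(vp);
+ *     }
+ *
+ * MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) can be used to leave the loop
+ * early and free the marker vnode.
+ */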
+
+MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
+
+struct vnode *
+__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ MNT_ILOCK(mp);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
+ while (vp != NULL && (vp->v_type == VMARKER ||
+ (vp->v_iflag & VI_DOOMED) != 0))
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ __mnt_vnode_markerfree_all(mvp, mp);
+ /* MNT_IUNLOCK(mp); -- done in above function */
+ mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
+ return (NULL);
+ }
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ VI_LOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ (*mvp)->v_type = VMARKER;
+
+ vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ while (vp != NULL && (vp->v_type == VMARKER ||
+ (vp->v_iflag & VI_DOOMED) != 0))
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+ return (NULL);
+ }
+ (*mvp)->v_mount = mp;
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ VI_LOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+
+void
+__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL) {
+ MNT_IUNLOCK(mp);
+ return;
+ }
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+/*
+ * These are helper functions for filesystems to traverse their
+ * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
+ */
+static void
+mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+static struct vnode *
+mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp, *nvp;
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+restart:
+ vp = TAILQ_NEXT(*mvp, v_actfreelist);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+ while (vp != NULL) {
+ if (vp->v_type == VMARKER) {
+ vp = TAILQ_NEXT(vp, v_actfreelist);
+ continue;
+ }
+ if (!VI_TRYLOCK(vp)) {
+ if (mp_ncpus == 1 || should_yield()) {
+ TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ pause("vnacti", 1);
+ mtx_lock(&vnode_free_list_mtx);
+ goto restart;
+ }
+ continue;
+ }
+ KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
+ KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
+ ("alien vnode on the active list %p %p", vp, mp));
+ if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
+ break;
+ nvp = TAILQ_NEXT(vp, v_actfreelist);
+ VI_UNLOCK(vp);
+ vp = nvp;
+ }
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ ASSERT_VI_LOCKED(vp, "active iter");
+ KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ mtx_lock(&vnode_free_list_mtx);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+struct vnode *
+__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+ (*mvp)->v_type = VMARKER;
+ (*mvp)->v_mount = mp;
+
+ mtx_lock(&vnode_free_list_mtx);
+ vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
+ if (vp == NULL) {
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+void
+__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL)
+ return;
+
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
new file mode 100644
index 0000000..4b82df8
--- /dev/null
+++ b/sys/kern/vfs_syscalls.c
@@ -0,0 +1,4729 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/disk.h>
+#include <sys/sysent.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/limits.h>
+#include <sys/linker.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+#include <sys/jail.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#include <ufs/ufs/quota.h>
+
+MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
+SDT_PROVIDER_DEFINE(vfs);
+SDT_PROBE_DEFINE2(vfs, , stat, mode, mode, "char *", "int");
+SDT_PROBE_DEFINE2(vfs, , stat, reg, reg, "char *", "int");
+
+static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
+static int kern_chflags(struct thread *td, const char *path,
+ enum uio_seg pathseg, u_long flags);
+static int kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag);
+static int setfflags(struct thread *td, struct vnode *, u_long);
+static int setutimes(struct thread *td, struct vnode *,
+ const struct timespec *, int, int);
+static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td);
+
+/*
+ * The module initialization routine for POSIX asynchronous I/O will
+ * set this to the version of AIO that it implements. (Zero means
+ * that it is not implemented.) This value is used here by pathconf()
+ * and in kern_descrip.c by fpathconf().
+ */
+int async_io_version;
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sync(td, uap)
+ struct thread *td;
+ struct sync_args *uap;
+{
+ struct mount *mp, *nmp;
+ int save;
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+ vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ }
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+int
+sys_quotactl(td, uap)
+ struct thread *td;
+ register struct quotactl_args /* {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_UID(uap->uid);
+ if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
+ return (EPERM);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ vput(nd.ni_vp);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+ error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
+
+ /*
+ * Since the quota-on operation typically needs to open the
+ * quota file, the Q_QUOTAON handler needs to unbusy the mount
+ * point before calling into namei. Otherwise, an unmount might
+ * be started between the two vfs_busy() invocations (the first
+ * is ours, the second from the mount point cross-walk code in
+ * lookup()), causing a deadlock.
+ *
+ * Require that Q_QUOTAON handles the vfs_busy() reference on
+ * its own, always returning with the mount point unbusied.
+ */
+ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Used by statfs conversion routines to scale the block size up if
+ * necessary so that all of the block counts are <= 'max_size'. Note
+ * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
+ * value of 'n'.
+ */
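+/*
+ * For instance, cvtstatfs() (under COMPAT_FREEBSD4 below) calls
+ * statfs_scale_blocks(nsp, LONG_MAX) so that the 64-bit counters fit
+ * in the long-sized fields of the old ostatfs structure.
+ */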
+void
+statfs_scale_blocks(struct statfs *sf, long max_size)
+{
+ uint64_t count;
+ int shift;
+
+ KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
+
+ /*
+ * Attempt to scale the block counts to give a more accurate
+ * overview to userland of the ratio of free space to used
+ * space. To do this, find the largest block count and compute
+ * a divisor that lets it fit into a signed integer <= max_size.
+ */
+ if (sf->f_bavail < 0)
+ count = -sf->f_bavail;
+ else
+ count = sf->f_bavail;
+ count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
+ if (count <= max_size)
+ return;
+
+ count >>= flsl(max_size);
+ shift = 0;
+ while (count > 0) {
+ shift++;
+ count >>= 1;
+ }
+
+ sf->f_bsize <<= shift;
+ sf->f_blocks >>= shift;
+ sf->f_bfree >>= shift;
+ sf->f_bavail >>= shift;
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+int
+sys_statfs(td, uap)
+ struct thread *td;
+ register struct statfs_args /* {
+ char *path;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ int error;
+
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
+ if (error == 0)
+ error = copyout(&sf, uap->buf, sizeof(sf));
+ return (error);
+}
+
+int
+kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
+ struct statfs *buf)
+{
+ struct mount *mp;
+ struct statfs *sp, sb;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ *buf = *sp;
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fstatfs(td, uap)
+ struct thread *td;
+ register struct fstatfs_args /* {
+ int fd;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ int error;
+
+ error = kern_fstatfs(td, uap->fd, &sf);
+ if (error == 0)
+ error = copyout(&sf, uap->buf, sizeof(sf));
+ return (error);
+}
+
+int
+kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct statfs *sp, sb;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FSTATFS), &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+#ifdef AUDIT
+ AUDIT_ARG_VNODE1(vp);
+#endif
+ mp = vp->v_mount;
+ if (mp)
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ fdrop(fp, td);
+ if (mp == NULL) {
+ error = EBADF;
+ goto out;
+ }
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ *buf = *sp;
+out:
+ if (mp)
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+sys_getfsstat(td, uap)
+ struct thread *td;
+ register struct getfsstat_args /* {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+ } */ *uap;
+{
+
+ return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
+ uap->flags));
+}
+
+/*
+ * If bufsize > 0 and bufseg == UIO_SYSSPACE, the buffer is allocated
+ * here and the caller is responsible for freeing the memory that is
+ * returned in '*buf'.
+ */
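+/*
+ * freebsd4_getfsstat() below is an example of the UIO_SYSSPACE case:
+ * it converts the entries and then frees the buffer returned in '*buf'
+ * with free(buf, M_TEMP).
+ */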
+int
+kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
+ enum uio_seg bufseg, int flags)
+{
+ struct mount *mp, *nmp;
+ struct statfs *sfsp, *sp, sb;
+ size_t count, maxcount;
+ int error;
+
+ maxcount = bufsize / sizeof(struct statfs);
+ if (bufsize == 0)
+ sfsp = NULL;
+ else if (bufseg == UIO_USERSPACE)
+ sfsp = *buf;
+ else /* if (bufseg == UIO_SYSSPACE) */ {
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ count++;
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (maxcount > count)
+ maxcount = count;
+ sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
+ M_WAITOK);
+ }
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (prison_canseemount(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#ifdef MAC
+ if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#endif
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+ * Set these in case the underlying filesystem
+ * fails to do so.
+ */
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ /*
+ * If MNT_NOWAIT or MNT_LAZY is specified, do not
+ * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
+ * overrides MNT_WAIT.
+ */
+ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (flags & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp))) {
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ continue;
+ }
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ if (bufseg == UIO_SYSSPACE)
+ bcopy(sp, sfsp, sizeof(*sp));
+ else /* if (bufseg == UIO_USERSPACE) */ {
+ error = copyout(sp, sfsp, sizeof(*sp));
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
+ }
+ sfsp++;
+ }
+ count++;
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (sfsp && count > maxcount)
+ td->td_retval[0] = maxcount;
+ else
+ td->td_retval[0] = count;
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * Get old format filesystem statistics.
+ */
+static void cvtstatfs(struct statfs *, struct ostatfs *);
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_statfs_args {
+ char *path;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_statfs(td, uap)
+ struct thread *td;
+ struct freebsd4_statfs_args /* {
+ char *path;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ int error;
+
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fstatfs_args {
+ int fd;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fstatfs(td, uap)
+ struct thread *td;
+ struct freebsd4_fstatfs_args /* {
+ int fd;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ int error;
+
+ error = kern_fstatfs(td, uap->fd, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_getfsstat_args {
+ struct ostatfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+freebsd4_getfsstat(td, uap)
+ struct thread *td;
+ register struct freebsd4_getfsstat_args /* {
+ struct ostatfs *buf;
+ long bufsize;
+ int flags;
+ } */ *uap;
+{
+ struct statfs *buf, *sp;
+ struct ostatfs osb;
+ size_t count, size;
+ int error;
+
+ count = uap->bufsize / sizeof(struct ostatfs);
+ size = count * sizeof(struct statfs);
+ error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
+ if (size > 0) {
+ count = td->td_retval[0];
+ sp = buf;
+ while (count > 0 && error == 0) {
+ cvtstatfs(sp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ sp++;
+ uap->buf++;
+ count--;
+ }
+ free(buf, M_TEMP);
+ }
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fhstatfs(td, uap)
+ struct thread *td;
+ struct freebsd4_fhstatfs_args /* {
+ struct fhandle *u_fhp;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ error = kern_fhstatfs(td, fh, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Convert a new format statfs structure to an old format statfs structure.
+ */
+static void
+cvtstatfs(nsp, osp)
+ struct statfs *nsp;
+ struct ostatfs *osp;
+{
+
+ statfs_scale_blocks(nsp, LONG_MAX);
+ bzero(osp, sizeof(*osp));
+ osp->f_bsize = nsp->f_bsize;
+ osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
+ osp->f_blocks = nsp->f_blocks;
+ osp->f_bfree = nsp->f_bfree;
+ osp->f_bavail = nsp->f_bavail;
+ osp->f_files = MIN(nsp->f_files, LONG_MAX);
+ osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
+ osp->f_owner = nsp->f_owner;
+ osp->f_type = nsp->f_type;
+ osp->f_flags = nsp->f_flags;
+ osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
+ osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
+ osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
+ osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
+ strlcpy(osp->f_fstypename, nsp->f_fstypename,
+ MIN(MFSNAMELEN, OMFSNAMELEN));
+ strlcpy(osp->f_mntonname, nsp->f_mntonname,
+ MIN(MNAMELEN, OMNAMELEN));
+ strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
+ MIN(MNAMELEN, OMNAMELEN));
+ osp->f_fsid = nsp->f_fsid;
+}
+#endif /* COMPAT_FREEBSD4 */
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+int
+sys_fchdir(td, uap)
+ struct thread *td;
+ struct fchdir_args /* {
+ int fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct vnode *vp, *tdp, *vpold;
+ struct mount *mp;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
+ &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ VREF(vp);
+ fdrop(fp, td);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ error = change_dir(vp, td);
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0))
+ continue;
+ error = VFS_ROOT(mp, LK_SHARED, &tdp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+ FILEDESC_XLOCK(fdp);
+ vpold = fdp->fd_cdir;
+ fdp->fd_cdir = vp;
+ FILEDESC_XUNLOCK(fdp);
+ vrele(vpold);
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+int
+sys_chdir(td, uap)
+ struct thread *td;
+ struct chdir_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_chdir(td, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct nameidata nd;
+ struct vnode *vp;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if ((error = change_dir(nd.ni_vp, td)) != 0) {
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_XLOCK(fdp);
+ vp = fdp->fd_cdir;
+ fdp->fd_cdir = nd.ni_vp;
+ FILEDESC_XUNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
+
+/*
+ * Helper function for raised chroot(2) security function: Refuse if
+ * any filedescriptors are open directories.
+ */
+static int
+chroot_refuse_vdir_fds(fdp)
+ struct filedesc *fdp;
+{
+ struct vnode *vp;
+ struct file *fp;
+ int fd;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL)
+ continue;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vp->v_type == VDIR)
+ return (EPERM);
+ }
+ }
+ return (0);
+}
+
+/*
+ * This sysctl determines if we will allow a process to chroot(2) if it
+ * has a directory open:
+ * 0: disallowed for all processes.
+ * 1: allowed for processes that were not already chroot(2)'ed.
+ * 2: allowed for all processes.
+ */
+
+static int chroot_allow_open_directories = 1;
+
+SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
+ &chroot_allow_open_directories, 0,
+ "Allow a process to chroot(2) if it has a directory open");
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+int
+sys_chroot(td, uap)
+ struct thread *td;
+ struct chroot_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_CHROOT);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ goto error;
+ error = change_dir(nd.ni_vp, td);
+ if (error != 0)
+ goto e_vunlock;
+#ifdef MAC
+ error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
+ if (error != 0)
+ goto e_vunlock;
+#endif
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = change_root(nd.ni_vp, td);
+ vrele(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+e_vunlock:
+ vput(nd.ni_vp);
+error:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
+
+/*
+ * Common routine for chroot and chdir. Callers must provide a locked vnode
+ * instance.
+ */
+int
+change_dir(vp, td)
+ struct vnode *vp;
+ struct thread *td;
+{
+#ifdef MAC
+ int error;
+#endif
+
+ ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+#ifdef MAC
+ error = mac_vnode_check_chdir(td->td_ucred, vp);
+ if (error != 0)
+ return (error);
+#endif
+ return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
+}
+
+/*
+ * Common routine for kern_chroot() and jail_attach(). The caller is
+ * responsible for invoking priv_check() and mac_vnode_check_chroot() to
+ * authorize this operation.
+ */
+int
+change_root(vp, td)
+ struct vnode *vp;
+ struct thread *td;
+{
+ struct filedesc *fdp;
+ struct vnode *oldvp;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (chroot_allow_open_directories == 0 ||
+ (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
+ error = chroot_refuse_vdir_fds(fdp);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+ }
+ oldvp = fdp->fd_rdir;
+ fdp->fd_rdir = vp;
+ VREF(fdp->fd_rdir);
+ if (!fdp->fd_jdir) {
+ fdp->fd_jdir = vp;
+ VREF(fdp->fd_jdir);
+ }
+ FILEDESC_XUNLOCK(fdp);
+ vrele(oldvp);
+ return (0);
+}
+
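+/*
+ * Translate open(2) flags into the capability rights that the lookup
+ * and the resulting file will require.  For example,
+ * O_WRONLY | O_CREAT | O_TRUNC asks for CAP_WRITE, CAP_CREATE and
+ * CAP_FTRUNCATE.
+ */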
+static __inline void
+flags_to_rights(int flags, cap_rights_t *rightsp)
+{
+
+ if (flags & O_EXEC) {
+ cap_rights_set(rightsp, CAP_FEXECVE);
+ } else {
+ switch ((flags & O_ACCMODE)) {
+ case O_RDONLY:
+ cap_rights_set(rightsp, CAP_READ);
+ break;
+ case O_RDWR:
+ cap_rights_set(rightsp, CAP_READ);
+ /* FALLTHROUGH */
+ case O_WRONLY:
+ cap_rights_set(rightsp, CAP_WRITE);
+ if (!(flags & (O_APPEND | O_TRUNC)))
+ cap_rights_set(rightsp, CAP_SEEK);
+ break;
+ }
+ }
+
+ if (flags & O_CREAT)
+ cap_rights_set(rightsp, CAP_CREATE);
+
+ if (flags & O_TRUNC)
+ cap_rights_set(rightsp, CAP_FTRUNCATE);
+
+ if (flags & (O_SYNC | O_FSYNC))
+ cap_rights_set(rightsp, CAP_FSYNC);
+
+ if (flags & (O_EXLOCK | O_SHLOCK))
+ cap_rights_set(rightsp, CAP_FLOCK);
+}
+
+/*
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+sys_open(td, uap)
+ struct thread *td;
+ register struct open_args /* {
+ char *path;
+ int flags;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct openat_args {
+ int fd;
+ char *path;
+ int flag;
+ int mode;
+};
+#endif
+int
+sys_openat(struct thread *td, struct openat_args *uap)
+{
+
+ return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->mode));
+}
+
+int
+kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
+ int mode)
+{
+
+ return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
+}
+
+int
+kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flags, int mode)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int cmode, error, indx;
+
+ indx = -1;
+
+ AUDIT_ARG_FFLAGS(flags);
+ AUDIT_ARG_MODE(mode);
+ /* XXX: audit dirfd */
+ cap_rights_init(&rights, CAP_LOOKUP);
+ flags_to_rights(flags, &rights);
+ /*
+ * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
+ * may be specified.
+ */
+ if (flags & O_EXEC) {
+ if (flags & O_ACCMODE)
+ return (EINVAL);
+ } else if ((flags & O_ACCMODE) == O_ACCMODE) {
+ return (EINVAL);
+ } else {
+ flags = FFLAGS(flags);
+ }
+
+ /*
+ * Allocate the file descriptor, but don't install a descriptor yet.
+ */
+ error = falloc_noinstall(td, &fp);
+ if (error != 0)
+ return (error);
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+ /* Set the flags early so the finit in devfs can pick them up. */
+ fp->f_flag = flags & FMASK;
+ cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ &rights, td);
+ td->td_dupfd = -1; /* XXX check for fdopen */
+ error = vn_open(&nd, &flags, cmode, fp);
+ if (error != 0) {
+ /*
+ * If vn_open() replaced the method vector, something
+ * wondrous happened deep below and we just pass it up
+ * pretending we know what we are doing.
+ */
+ if (error == ENXIO && fp->f_ops != &badfileops)
+ goto success;
+
+ /*
+ * Handle special fdopen() case. bleh.
+ *
+ * Don't do this for relative (capability) lookups; we don't
+ * understand exactly what would happen, and we don't think
+ * that it ever should.
+ */
+ if (nd.ni_strictrelative == 0 &&
+ (error == ENODEV || error == ENXIO) &&
+ td->td_dupfd >= 0) {
+ error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+ &indx);
+ if (error == 0)
+ goto success;
+ }
+
+ goto bad;
+ }
+ td->td_dupfd = 0;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ /*
+ * Store the vnode, for any f_type. Typically, the vnode use
+ * count is decremented by direct call to vn_closefile() for
+ * files that switched type in the cdevsw fdopen() method.
+ */
+ fp->f_vnode = vp;
+ /*
+ * If the file wasn't claimed by devfs bind it to the normal
+ * vnode operations here.
+ */
+ if (fp->f_ops == &badfileops) {
+ KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
+ fp->f_seqcount = 1;
+ finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
+ DTYPE_VNODE, vp, &vnops);
+ }
+
+ VOP_UNLOCK(vp, 0);
+ if (flags & O_TRUNC) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+success:
+ /*
+ * If we haven't already installed the FD (for dupfdopen), do so now.
+ */
+ if (indx == -1) {
+ struct filecaps *fcaps;
+
+#ifdef CAPABILITIES
+ if (nd.ni_strictrelative == 1)
+ fcaps = &nd.ni_filecaps;
+ else
+#endif
+ fcaps = NULL;
+ error = finstall(td, fp, &indx, flags, fcaps);
+ /* On success finstall() consumes fcaps. */
+ if (error != 0) {
+ filecaps_free(&nd.ni_filecaps);
+ goto bad;
+ }
+ } else {
+ filecaps_free(&nd.ni_filecaps);
+ }
+
+ /*
+ * Release our private reference, leaving the one associated with
+ * the descriptor table intact.
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+bad:
+ KASSERT(indx == -1, ("indx=%d, should be -1", indx));
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(td, uap)
+ struct thread *td;
+ register struct ocreat_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_open(td, uap->path, UIO_USERSPACE,
+ O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+int
+sys_mknod(td, uap)
+ struct thread *td;
+ register struct mknod_args /* {
+ char *path;
+ int mode;
+ int dev;
+ } */ *uap;
+{
+
+ return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mknodat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+ dev_t dev;
+};
+#endif
+int
+sys_mknodat(struct thread *td, struct mknodat_args *uap)
+{
+
+ return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
+ uap->dev));
+}
+
+int
+kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
+ int dev)
+{
+
+ return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
+}
+
+int
+kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode, int dev)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, whiteout = 0;
+
+ AUDIT_ARG_MODE(mode);
+ AUDIT_ARG_DEV(dev);
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ case S_IFBLK:
+ error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+ break;
+ case S_IFMT:
+ error = priv_check(td, PRIV_VFS_MKNOD_BAD);
+ break;
+ case S_IFWHT:
+ error = priv_check(td, PRIV_VFS_MKNOD_WHT);
+ break;
+ case S_IFIFO:
+ if (dev == 0)
+ return (kern_mkfifoat(td, fd, path, pathseg, mode));
+ /* FALLTHROUGH */
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0)
+ return (error);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ } else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (mode & ALLPERMS) &
+ ~td->td_proc->p_fd->fd_cmask;
+ vattr.va_rdev = dev;
+ whiteout = 0;
+
+ switch (mode & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ panic("kern_mknod: invalid mode");
+ }
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ if (error == 0 && !whiteout)
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
+ &nd.ni_cnd, &vattr);
+#endif
+ if (error == 0) {
+ if (whiteout)
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ }
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkfifo(td, uap)
+ struct thread *td;
+ register struct mkfifo_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifoat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
+{
+
+ return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->mode));
+}
+
+int
+kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
+{
+
+ return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
+}
+
+int
+kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out:
+#endif
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_link(td, uap)
+ struct thread *td;
+ register struct link_args /* {
+ char *path;
+ char *link;
+ } */ *uap;
+{
+
+ return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct linkat_args {
+ int fd1;
+ char *path1;
+ int fd2;
+ char *path2;
+ int flag;
+};
+#endif
+int
+sys_linkat(struct thread *td, struct linkat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_FOLLOW)
+ return (EINVAL);
+
+ return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
+ UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
+}
+
+int hardlink_check_uid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
+ &hardlink_check_uid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "users");
+static int hardlink_check_gid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
+ &hardlink_check_gid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "groups");
+
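+/*
+ * Check whether the current credentials may create a hard link to the
+ * file, honoring the security.bsd.hardlink_check_uid/gid sysctls
+ * above; PRIV_VFS_LINK overrides either restriction.
+ */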
+static int
+can_hardlink(struct vnode *vp, struct ucred *cred)
+{
+ struct vattr va;
+ int error;
+
+ if (!hardlink_check_uid && !hardlink_check_gid)
+ return (0);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ return (error);
+
+ if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
+{
+
+ return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path, link, segflg,
+     FOLLOW));
+}
+
+int
+kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
+ enum uio_seg segflg, int follow)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ bwillwrite();
+ NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR) {
+ vrele(vp);
+ return (EPERM); /* POSIX */
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
+ segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
+ if ((error = namei(&nd)) == 0) {
+ if (nd.ni_vp != NULL) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
+ == 0) {
+ error = can_hardlink(vp, td->td_ucred);
+ if (error == 0)
+#ifdef MAC
+ error = mac_vnode_check_link(td->td_ucred,
+ nd.ni_dvp, vp, &nd.ni_cnd);
+ if (error == 0)
+#endif
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ VOP_UNLOCK(vp, 0);
+ vput(nd.ni_dvp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ }
+ vrele(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_symlink(td, uap)
+ struct thread *td;
+ register struct symlink_args /* {
+ char *path;
+ char *link;
+ } */ *uap;
+{
+
+ return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct symlinkat_args {
+ char *path1;
+ int fd;
+ char *path2;
+};
+#endif
+int
+sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
+{
+
+ return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
+ UIO_USERSPACE));
+}
+
+int
+kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
+{
+
+ return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
+}
+
+int
+kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
+ enum uio_seg segflg)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ char *syspath;
+ struct nameidata nd;
+ int error;
+ cap_rights_t rights;
+
+ if (segflg == UIO_SYSSPACE) {
+ syspath = path1;
+ } else {
+ syspath = uma_zalloc(namei_zone, M_WAITOK);
+ if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ }
+ AUDIT_ARG_TEXT(syspath);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ if (nd.ni_vp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ goto out;
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ vattr.va_type = VLNK;
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out2;
+#endif
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out2:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+out:
+ if (segflg != UIO_SYSSPACE)
+ uma_zfree(namei_zone, syspath);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+int
+sys_undelete(td, uap)
+ struct thread *td;
+ register struct undelete_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+int
+sys_unlink(td, uap)
+ struct thread *td;
+ struct unlink_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_unlink(td, uap->path, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct unlinkat_args {
+ int fd;
+ char *path;
+ int flag;
+};
+#endif
+int
+sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+
+ if (flag & ~AT_REMOVEDIR)
+ return (EINVAL);
+
+ if (flag & AT_REMOVEDIR)
+ return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
+ else
+ return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
+}
+
+int
+kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
+{
+
+ return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
+}
+
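+/*
+ * If 'oldinum' is non-zero, the file is only removed while the path
+ * still refers to that inode number; otherwise EIDRM is returned.
+ */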
+int
+kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ ino_t oldinum)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ struct stat sb;
+ cap_rights_t rights;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error == EINVAL ? EPERM : error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR && oldinum == 0) {
+ error = EPERM; /* POSIX */
+ } else if (oldinum != 0 &&
+ ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
+ sb.st_ino != oldinum) {
+ error = EIDRM; /* Identifier removed */
+ } else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_vflag & VV_ROOT)
+ error = EBUSY;
+ }
+ if (error == 0) {
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+#ifdef MAC
+out:
+#endif
+ vn_finished_write(mp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+sys_lseek(td, uap)
+ struct thread *td;
+ register struct lseek_args /* {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
+ if (error != 0)
+ return (error);
+ error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
+ fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
+ fdrop(fp, td);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(td, uap)
+ struct thread *td;
+ register struct olseek_args /* {
+ int fd;
+ long offset;
+ int whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+ } */ nuap;
+
+ nuap.fd = uap->fd;
+ nuap.offset = uap->offset;
+ nuap.whence = uap->whence;
+ return (sys_lseek(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/* Version with the 'pad' argument */
+int
+freebsd6_lseek(td, uap)
+ struct thread *td;
+ register struct freebsd6_lseek_args *uap;
+{
+ struct lseek_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.offset = uap->offset;
+ ouap.whence = uap->whence;
+ return (sys_lseek(td, &ouap));
+}
+
+/*
+ * Check access permissions using passed credentials.
+ */
+static int
+vn_access(vp, user_flags, cred, td)
+ struct vnode *vp;
+ int user_flags;
+ struct ucred *cred;
+ struct thread *td;
+{
+ accmode_t accmode;
+ int error;
+
+ /* Flags == 0 means only check for existence. */
+ error = 0;
+ if (user_flags) {
+ accmode = 0;
+ if (user_flags & R_OK)
+ accmode |= VREAD;
+ if (user_flags & W_OK)
+ accmode |= VWRITE;
+ if (user_flags & X_OK)
+ accmode |= VEXEC;
+#ifdef MAC
+ error = mac_vnode_check_access(cred, vp, accmode);
+ if (error != 0)
+ return (error);
+#endif
+ if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "real" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_access(td, uap)
+ struct thread *td;
+ register struct access_args /* {
+ char *path;
+ int amode;
+ } */ *uap;
+{
+
+ return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct faccessat_args {
+ int fd;
+ char *path;
+ int amode;
+ int flag;
+};
+#endif
+int
+sys_faccessat(struct thread *td, struct faccessat_args *uap)
+{
+
+ if (uap->flag & ~AT_EACCESS)
+ return (EINVAL);
+ return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->amode));
+}
+
+int
+kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
+{
+
+ return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
+}
+
+int
+kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flag, int amode)
+{
+ struct ucred *cred, *tmpcred;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ /*
+ * Create and modify a temporary credential instead of one that
+ * is potentially shared.
+ */
+ if (!(flag & AT_EACCESS)) {
+ cred = td->td_ucred;
+ tmpcred = crdup(cred);
+ tmpcred->cr_uid = cred->cr_ruid;
+ tmpcred->cr_groups[0] = cred->cr_rgid;
+ td->td_ucred = tmpcred;
+ } else
+ cred = tmpcred = td->td_ucred;
+ AUDIT_ARG_VALUE(amode);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
+ AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
+ td);
+ if ((error = namei(&nd)) != 0)
+ goto out1;
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, amode, tmpcred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+out1:
+ if (!(flag & AT_EACCESS)) {
+ td->td_ucred = cred;
+ crfree(tmpcred);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "effective" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct eaccess_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_eaccess(td, uap)
+ struct thread *td;
+ register struct eaccess_args /* {
+ char *path;
+ int amode;
+ } */ *uap;
+{
+
+ return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
+}
+
+int
+kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
+{
+
+ return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
+}
+
+#if defined(COMPAT_43)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+ostat(td, uap)
+ struct thread *td;
+ register struct ostat_args /* {
+ char *path;
+ struct ostat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+olstat(td, uap)
+ struct thread *td;
+ register struct olstat_args /* {
+ char *path;
+ struct ostat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
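+	/* Sizes that do not fit in the old 32-bit field are reported as -2. */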
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atim = st->st_atim;
+ ost->st_mtim = st->st_mtim;
+ ost->st_ctim = st->st_ctim;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+int
+sys_stat(td, uap)
+ struct thread *td;
+ register struct stat_args /* {
+ char *path;
+ struct stat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->ub, sizeof (sb));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fstatat_args {
+ int fd;
+ char *path;
+ struct stat *buf;
+ int flag;
+};
+#endif
+int
+sys_fstatat(struct thread *td, struct fstatat_args *uap)
+{
+ struct stat sb;
+ int error;
+
+ error = kern_statat(td, uap->flag, uap->fd, uap->path,
+ UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->buf, sizeof (sb));
+ return (error);
+}
+
+int
+kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
+}
+
+int
+kern_statat(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
+}
+
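+/*
+ * As kern_statat(), but an optional hook is invoked with the locked vnode
+ * and the collected attributes before they are copied out to the caller.
+ */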
+int
+kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp,
+ void (*hook)(struct vnode *vp, struct stat *sbp))
+{
+ struct nameidata nd;
+ struct stat sb;
+ cap_rights_t rights;
+ int error;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+ FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FSTAT), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
+ if (error == 0) {
+ SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
+ if (S_ISREG(sb.st_mode))
+ SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
+ if (__predict_false(hook != NULL))
+ hook(nd.ni_vp, &sb);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ if (error != 0)
+ return (error);
+ *sbp = sb;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrstat(&sb);
+#endif
+ return (0);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+int
+sys_lstat(td, uap)
+ struct thread *td;
+ register struct lstat_args /* {
+ char *path;
+ struct stat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->ub, sizeof (sb));
+ return (error);
+}
+
+int
+kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
+ sbp));
+}
+
+/*
+ * Implementation of the NetBSD [l]stat() functions.
+ */
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+
+ bzero(nsb, sizeof *nsb);
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atim = sb->st_atim;
+ nsb->st_mtim = sb->st_mtim;
+ nsb->st_ctim = sb->st_ctim;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+ nsb->st_birthtim = sb->st_birthtim;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+int
+sys_nstat(td, uap)
+ struct thread *td;
+ register struct nstat_args /* {
+ char *path;
+ struct nstat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+
+/*
+ * NetBSD lstat. Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+int
+sys_nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ char *path;
+ struct nstat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ char *path;
+ int name;
+ } */ *uap;
+{
+
+ return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct lpathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_lpathconf(td, uap)
+ struct thread *td;
+ register struct lpathconf_args /* {
+ char *path;
+ int name;
+ } */ *uap;
+{
+
+ return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
+ NOFOLLOW));
+}
+
+int
+kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
+ u_long flags)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /* If asynchronous I/O is available, it works for all files. */
+ if (name == _PC_ASYNC_IO)
+ td->td_retval[0] = async_io_version;
+ else
+ error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+sys_readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ char *path;
+ char *buf;
+ size_t count;
+ } */ *uap;
+{
+
+ return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
+ UIO_USERSPACE, uap->count));
+}
+#ifndef _SYS_SYSPROTO_H_
+struct readlinkat_args {
+ int fd;
+ char *path;
+ char *buf;
+ size_t bufsize;
+};
+#endif
+int
+sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
+{
+
+ return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->buf, UIO_USERSPACE, uap->bufsize));
+}
+
+int
+kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
+ enum uio_seg bufseg, size_t count)
+{
+
+ return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
+ count));
+}
+
+int
+kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ char *buf, enum uio_seg bufseg, size_t count)
+{
+ struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ struct nameidata nd;
+ int error;
+
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+
+ NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+#ifdef MAC
+ error = mac_vnode_check_readlink(td->td_ucred, vp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+#endif
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ auio.uio_resid = count;
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+	vput(vp);
+	if (error == 0)
+		td->td_retval[0] = count - auio.uio_resid;
+	return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ u_long flags;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ /* We can't support the value matching VNOVAL. */
+ if (flags == VNOVAL)
+ return (EOPNOTSUPP);
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
+ if (error != 0)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ const char *path;
+ u_long flags;
+};
+#endif
+int
+sys_chflags(td, uap)
+ struct thread *td;
+ register struct chflags_args /* {
+ const char *path;
+ u_long flags;
+ } */ *uap;
+{
+
+ return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct chflagsat_args {
+ int fd;
+ const char *path;
+ u_long flags;
+ int atflag;
+};
+#endif
+int
+sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
+{
+ int fd = uap->fd;
+ const char *path = uap->path;
+ u_long flags = uap->flags;
+ int atflag = uap->atflag;
+
+ if (atflag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
+}
+
+static int
+kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
+ u_long flags)
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
+}
+
+/*
+ * Same as chflags() but doesn't follow symlinks.
+ */
+int
+sys_lchflags(td, uap)
+ struct thread *td;
+ register struct lchflags_args /* {
+ const char *path;
+ u_long flags;
+ } */ *uap;
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, AT_SYMLINK_NOFOLLOW));
+}
+
+static int
+kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_FFLAGS(flags);
+ follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, flags);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ u_long flags;
+};
+#endif
+int
+sys_fchflags(td, uap)
+ struct thread *td;
+ register struct fchflags_args /* {
+ int fd;
+ u_long flags;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_FFLAGS(uap->flags);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setfflags(td, fp->f_vnode, uap->flags);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for chmod(), lchmod() and fchmod().
+ */
+int
+setfmode(td, cred, vp, mode)
+ struct thread *td;
+ struct ucred *cred;
+ struct vnode *vp;
+ int mode;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+#ifdef MAC
+ error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_chmod(td, uap)
+ struct thread *td;
+ register struct chmod_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchmodat_args {
+	int	fd;
+	char	*path;
+	mode_t	mode;
+	int	flag;
+};
+#endif
+int
+sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+ mode_t mode = uap->mode;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
+}
+
+int
+kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
+}
+
+/*
+ * Change mode of a file given path name (don't follow links.)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_lchmod(td, uap)
+ struct thread *td;
+ register struct lchmod_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode, AT_SYMLINK_NOFOLLOW));
+}
+
+int
+kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ mode_t mode, int flag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_MODE(mode);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHMOD), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+int
+sys_fchmod(struct thread *td, struct fchmod_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_MODE(uap->mode);
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chmod(fp, uap->mode, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation for chown(), lchown(), and fchown()
+ */
+int
+setfown(td, cred, vp, uid, gid)
+ struct thread *td;
+ struct ucred *cred;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+#ifdef MAC
+ error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
+ vattr.va_gid);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_chown(td, uap)
+ struct thread *td;
+ register struct chown_args /* {
+ char *path;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+
+ return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchownat_args {
+ int fd;
+	const char *path;
+ uid_t uid;
+ gid_t gid;
+ int flag;
+};
+#endif
+int
+sys_fchownat(struct thread *td, struct fchownat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
+ uap->gid, uap->flag));
+}
+
+int
+kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
+ int gid)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
+}
+
+int
+kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int uid, int gid, int flag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_OWNER(uid, gid);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHOWN), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name; do not cross symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_lchown(td, uap)
+ struct thread *td;
+ register struct lchown_args /* {
+ char *path;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+
+ return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
+}
+
+int
+kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
+ int gid)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
+ AT_SYMLINK_NOFOLLOW));
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_fchown(td, uap)
+ struct thread *td;
+ register struct fchown_args /* {
+ int fd;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_OWNER(uap->uid, uap->gid);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+getutimes(usrtvp, tvpseg, tsp)
+ const struct timeval *usrtvp;
+ enum uio_seg tvpseg;
+ struct timespec *tsp;
+{
+ struct timeval tv[2];
+ const struct timeval *tvp;
+ int error;
+
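+	/* A null usrtvp means "use the current time for both timestamps". */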
+ if (usrtvp == NULL) {
+ vfs_timestamp(&tsp[0]);
+ tsp[1] = tsp[0];
+ } else {
+ if (tvpseg == UIO_SYSSPACE) {
+ tvp = usrtvp;
+ } else {
+ if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
+ return (error);
+ tvp = tv;
+ }
+
+ if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
+ tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
+ return (EINVAL);
+ TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
+ TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
+ }
+ return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+setutimes(td, vp, ts, numtimes, nullflag)
+ struct thread *td;
+ struct vnode *vp;
+ const struct timespec *ts;
+ int numtimes;
+ int nullflag;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error, setbirthtime;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ setbirthtime = 0;
+ if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
+ timespeccmp(&ts[1], &vattr.va_birthtime, < ))
+ setbirthtime = 1;
+ VATTR_NULL(&vattr);
+ vattr.va_atime = ts[0];
+ vattr.va_mtime = ts[1];
+ if (setbirthtime)
+ vattr.va_birthtime = ts[1];
+ if (numtimes > 2)
+ vattr.va_birthtime = ts[2];
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+#ifdef MAC
+ error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
+ vattr.va_mtime);
+#endif
+ if (error == 0)
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_utimes(td, uap)
+ struct thread *td;
+ register struct utimes_args /* {
+ char *path;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
+ UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct futimesat_args {
+ int fd;
+	const char *path;
+	const struct timeval *times;
+};
+#endif
+int
+sys_futimesat(struct thread *td, struct futimesat_args *uap)
+{
+
+ return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->times, UIO_USERSPACE));
+}
+
+int
+kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+
+ return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
+}
+
+int
+kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct nameidata nd;
+ struct timespec ts[2];
+ cap_rights_t rights;
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_lutimes(td, uap)
+ struct thread *td;
+ register struct lutimes_args /* {
+ char *path;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
+ UIO_USERSPACE));
+}
+
+int
+kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct nameidata nd;
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_futimes(td, uap)
+ struct thread *td;
+ register struct futimes_args /* {
+ int fd;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
+}
+
+int
+kern_futimes(struct thread *td, int fd, struct timeval *tptr,
+ enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getutimes(tptr, tptrseg, ts);
+ if (error != 0)
+ return (error);
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+int
+sys_truncate(td, uap)
+ struct thread *td;
+ register struct truncate_args /* {
+ char *path;
+ int pad;
+ off_t length;
+ } */ *uap;
+{
+
+ return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
+}
+
+int
+kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ if (length < 0)
+		return (EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
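+	/* Serialize the size change against concurrent ranged I/O. */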
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+#ifdef MAC
+ else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
+ }
+#endif
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ }
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+int
+otruncate(td, uap)
+ struct thread *td;
+ register struct otruncate_args /* {
+ char *path;
+ long length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ char *path;
+ int pad;
+ off_t length;
+ } */ nuap;
+
+ nuap.path = uap->path;
+ nuap.length = uap->length;
+ return (sys_truncate(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/* Versions with the pad argument */
+int
+freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
+{
+ struct truncate_args ouap;
+
+ ouap.path = uap->path;
+ ouap.length = uap->length;
+ return (sys_truncate(td, &ouap));
+}
+
+int
+freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
+{
+ struct ftruncate_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.length = uap->length;
+ return (sys_ftruncate(td, &ouap));
+}
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+int
+sys_fsync(td, uap)
+ struct thread *td;
+ struct fsync_args /* {
+ int fd;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct file *fp;
+ cap_rights_t rights;
+ int error, lock_flags;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FSYNC), &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto drop;
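+	/* A shared lock suffices on filesystems that permit shared writes. */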
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+ vn_lock(vp, lock_flags | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories, or
+ * both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+int
+sys_rename(td, uap)
+ struct thread *td;
+ register struct rename_args /* {
+ char *from;
+ char *to;
+ } */ *uap;
+{
+
+ return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct renameat_args {
+ int oldfd;
+ char *old;
+ int newfd;
+ char *new;
+};
+#endif
+int
+sys_renameat(struct thread *td, struct renameat_args *uap)
+{
+
+ return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
+ UIO_USERSPACE));
+}
+
+int
+kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
+{
+
+ return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
+}
+
+int
+kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
+ enum uio_seg pathseg)
+{
+ struct mount *mp = NULL;
+ struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ cap_rights_t rights;
+ int error;
+
+ bwillwrite();
+#ifdef MAC
+ NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
+ AUDITVNODE1, pathseg, old, oldfd,
+ cap_rights_init(&rights, CAP_RENAMEAT), td);
+#else
+ NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
+ pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
+#endif
+
+ if ((error = namei(&fromnd)) != 0)
+ return (error);
+#ifdef MAC
+ error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
+ fromnd.ni_vp, &fromnd.ni_cnd);
+ VOP_UNLOCK(fromnd.ni_dvp, 0);
+ if (fromnd.ni_dvp != fromnd.ni_vp)
+ VOP_UNLOCK(fromnd.ni_vp, 0);
+#endif
+ fvp = fromnd.ni_vp;
+ if (error == 0)
+ error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
+ if (error != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
+ SAVESTART | AUDITVNODE2, pathseg, new, newfd,
+ cap_rights_init(&rights, CAP_LINKAT), td);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&tond)) != 0) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ vn_finished_write(mp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef CAPABILITIES
+ if (newfd != AT_FDCWD) {
+ /*
+ * If the target already exists we require CAP_UNLINKAT
+ * from 'newfd'.
+ */
+ error = cap_check(&tond.ni_filecaps.fc_rights,
+ cap_rights_init(&rights, CAP_UNLINKAT));
+ if (error != 0)
+ goto out;
+ }
+#endif
+ }
+ if (fvp == tdvp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * If the source is the same as the destination (that is, if they
+ * are links to the same vnode), then there is nothing to do.
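+	 * An error value of -1 is used as an internal marker here; it is
+	 * converted back to success before kern_renameat() returns.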
+ */
+ if (fvp == tvp)
+ error = -1;
+#ifdef MAC
+ else
+ error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
+ tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
+#endif
+out:
+ if (error == 0) {
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ } else {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tvp != NULL)
+ vput(tvp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ vn_finished_write(mp);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkdir(td, uap)
+ struct thread *td;
+ register struct mkdir_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkdirat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
+{
+
+ return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+int
+kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
+{
+
+ return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
+}
+
+int
+kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
+ int mode)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ /*
+ * XXX namei called with LOCKPARENT but not LOCKLEAF has
+ * the strange behaviour of leaving the vnode unlocked
+ * if the target is the same vnode as the parent.
+ */
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
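+	/* A suspension is in progress; wait for it to finish and retry. */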
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+#ifdef MAC
+out:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error == 0)
+ vput(nd.ni_vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+int
+sys_rmdir(td, uap)
+ struct thread *td;
+ struct rmdir_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_rmdir(td, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
+{
+
+ return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
+}
+
+int
+kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_vflag & VV_ROOT) {
+ error = EBUSY;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ vn_finished_write(mp);
+out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
+{
+ long loff;
+ int error;
+
+ error = kern_ogetdirentries(td, uap, &loff);
+ if (error == 0)
+ error = copyout(&loff, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
+ long *ploff)
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ cap_rights_t rights;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+ off_t foffset;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ if (uap->count > 64 * 1024)
+ return (EINVAL);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+unionread:
+ if (vp->v_type != VDIR) {
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = uap->count;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ loff = auio.uio_offset = foffset;
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, FOF_NOUPDATE);
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ foffset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = uap->count;
+ dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ foffset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = uap->count - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ free(dirbuf, M_TEMP);
+ }
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (error);
+ }
+ if (uap->count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ foffset = 0;
+ vput(tvp);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ td->td_retval[0] = uap->count - auio.uio_resid;
+ if (error == 0)
+ *ploff = loff;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+sys_getdirentries(td, uap)
+ struct thread *td;
+ register struct getdirentries_args /* {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+ } */ *uap;
+{
+ long base;
+ int error;
+
+ error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+ NULL, UIO_USERSPACE);
+ if (error != 0)
+ return (error);
+ if (uap->basep != NULL)
+ error = copyout(&base, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
+ long *basep, ssize_t *residp, enum uio_seg bufseg)
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ cap_rights_t rights;
+ long loff;
+ int error, eofflag;
+ off_t foffset;
+
+ AUDIT_ARG_FD(fd);
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+ auio.uio_resid = count;
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+unionread:
+ if (vp->v_type != VDIR) {
+ error = EINVAL;
+ goto fail;
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ loff = auio.uio_offset = foffset;
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
+ NULL);
+ foffset = auio.uio_offset;
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ goto fail;
+ }
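+	/*
+	 * If no entries were returned and this is the root of a union mount,
+	 * retry the read on the vnode that the union is mounted over.
+	 */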
+ if (count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ foffset = 0;
+ vput(tvp);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0);
+ *basep = loff;
+ if (residp != NULL)
+ *residp = auio.uio_resid;
+ td->td_retval[0] = count - auio.uio_resid;
+fail:
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+sys_getdents(td, uap)
+ struct thread *td;
+ register struct getdents_args /* {
+ int fd;
+ char *buf;
+		size_t count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return (sys_getdirentries(td, &ap));
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+sys_umask(td, uap)
+ struct thread *td;
+ struct umask_args /* {
+ int newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ FILEDESC_XLOCK(td->td_proc->p_fd);
+ fdp = td->td_proc->p_fd;
+ td->td_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = uap->newmask & ALLPERMS;
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ return (0);
+}
+
+/*
+ * Void all references to the file by ripping the underlying filesystem away
+ * from the vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+int
+sys_revoke(td, uap)
+ struct thread *td;
+ register struct revoke_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VCHR || vp->v_rdev == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_revoke(td->td_ucred, vp);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ if (error != 0)
+ goto out;
+ if (td->td_ucred->cr_uid != vattr.va_uid) {
+ error = priv_check(td, PRIV_VFS_ADMIN);
+ if (error != 0)
+ goto out;
+ }
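+	/* Only bother revoking if the device has other active references. */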
+ if (vcount(vp) > 1)
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry and check that, if it
+ * is a capability, the correct rights are present. A reference on the file
+ * entry is held upon returning.
+ */
+int
+getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+
+ /*
+	 * The file might not be of vnode type, or it may not yet be fully
+	 * initialized, in which case the f_vnode pointer may be set but
+	 * f_ops is still badfileops.  For example, devfs_open() transiently
+	 * creates such a situation to facilitate csw d_fdopen().
+	 *
+	 * The dupfdopen() handling in kern_openat() installs the half-baked
+	 * file into the process descriptor table, allowing another thread
+	 * to dereference it.  Guard against the race by checking f_ops.
+ */
+ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
+ fdrop(fp, curthread);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * Get an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lgetfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_lgetfh(td, uap)
+ struct thread *td;
+ register struct lgetfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_getfh(td, uap)
+ struct thread *td;
+ register struct getfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+/*
+ * Syscall used by rpc.lockd to translate an NFS file handle into an open
+ * file descriptor.
+ *
+ * Warning: do not remove the priv_check() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhopen_args {
+ const struct fhandle *u_fhp;
+ int flags;
+};
+#endif
+int
+sys_fhopen(td, uap)
+ struct thread *td;
+ struct fhopen_args /* {
+ const struct fhandle *u_fhp;
+ int flags;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct fhandle fhp;
+ struct file *fp;
+ int fmode, error;
+ int indx;
+
+ error = priv_check(td, PRIV_VFS_FHOPEN);
+ if (error != 0)
+ return (error);
+ indx = -1;
+ fmode = FFLAGS(uap->flags);
+ /* why not allow a non-read/write open for our lockd? */
+ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
+ return (EINVAL);
+ error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
+ if (error != 0)
+		return (error);
+ /* find the mount point */
+ mp = vfs_busyfs(&fhp.fh_fsid);
+ if (mp == NULL)
+ return (ESTALE);
+ /* now give me my vnode, it gets returned to me locked */
+ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+
+ error = falloc_noinstall(td, &fp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+
+#ifdef INVARIANTS
+ td->td_dupfd = -1;
+#endif
+ error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+ if (error != 0) {
+ KASSERT(fp->f_ops == &badfileops,
+ ("VOP_OPEN in fhopen() set f_ops"));
+ KASSERT(td->td_dupfd < 0,
+ ("fhopen() encountered fdopen()"));
+
+ vput(vp);
+ goto bad;
+ }
+#ifdef INVARIANTS
+ td->td_dupfd = 0;
+#endif
+ fp->f_vnode = vp;
+ fp->f_seqcount = 1;
+ finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
+ &vnops);
+ VOP_UNLOCK(vp, 0);
+ if ((fmode & O_TRUNC) != 0) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+
+ error = finstall(td, fp, &indx, fmode, NULL);
+bad:
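+	/*
+	 * Release the reference from falloc_noinstall(); on success the
+	 * descriptor table holds its own reference from finstall().
+	 */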
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (error);
+}
+
+/*
+ * Stat an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstat_args {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+};
+#endif
+int
+sys_fhstat(td, uap)
+ struct thread *td;
+ register struct fhstat_args /* {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+ } */ *uap;
+{
+ struct stat sb;
+ struct fhandle fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ error = kern_fhstat(td, fh, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->sb, sizeof(sb));
+ return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTAT);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+ error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fhstatfs(td, uap)
+ struct thread *td;
+ struct fhstatfs_args /* {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ error = kern_fhstatfs(td, fh, &sf);
+ if (error != 0)
+ return (error);
+ return (copyout(&sf, uap->buf, sizeof(sf)));
+}
+
+int
+kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
+{
+ struct statfs *sp;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTATFS);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
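+	/*
+	 * The vnode was needed only to validate the handle; keep the mount
+	 * busy for VFS_STATFS() below.
+	 */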
+ vput(vp);
+ error = prison_canseemount(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error == 0)
+ *buf = *sp;
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+int
+kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ off_t olen, ooffset;
+ int error;
+
+ fp = NULL;
+ error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (offset < 0 || len <= 0) {
+ error = EINVAL;
+ goto out;
+ }
+ /* Check for wrap. */
+ if (offset > OFF_MAX - len) {
+ error = EFBIG;
+ goto out;
+ }
+
+ /* Allocating blocks may take a long time, so iterate. */
+ for (;;) {
+ olen = len;
+ ooffset = offset;
+
+ bwillwrite();
+ mp = NULL;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ break;
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vn_finished_write(mp);
+ break;
+ }
+#ifdef MAC
+ error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_ALLOCATE(vp, &offset, &len);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+
+ if (olen + ooffset != offset + len) {
+ panic("offset + len changed from %jx/%jx to %jx/%jx",
+ ooffset, olen, offset, len);
+ }
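+		/*
+		 * VOP_ALLOCATE() advances offset and shrinks len to reflect
+		 * the remaining work, so stop once len reaches zero or an
+		 * error is returned.
+		 */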
+ if (error != 0 || len == 0)
+ break;
+ KASSERT(olen > len, ("Iteration did not make progress?"));
+ maybe_yield();
+ }
+ out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
+{
+
+ return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
+}
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint. Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+ int advice)
+{
+ struct fadvise_info *fa, *new;
+ struct file *fp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ off_t end;
+ int error;
+
+ if (offset < 0 || len < 0 || offset > OFF_MAX - len)
+ return (EINVAL);
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ new = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /* XXX: CAP_POSIX_FADVISE? */
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (len == 0)
+ end = OFF_MAX;
+ else
+ end = offset + len - 1;
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Try to merge any existing non-standard region with
+ * this new region if possible, otherwise create a new
+ * non-standard region for this request.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL && fa->fa_advice == advice &&
+ ((fa->fa_start <= end && fa->fa_end >= offset) ||
+ (end != OFF_MAX && fa->fa_start == end + 1) ||
+ (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
+ if (offset < fa->fa_start)
+ fa->fa_start = offset;
+ if (end > fa->fa_end)
+ fa->fa_end = end;
+ } else {
+ new->fa_advice = advice;
+ new->fa_start = offset;
+ new->fa_end = end;
+ new->fa_prevstart = 0;
+ new->fa_prevend = 0;
+ fp->f_advice = new;
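+			/* The old structure (may be NULL) is freed below. */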
+ new = fa;
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_NORMAL:
+ /*
+		 * If the "normal" region overlaps with an existing
+ * non-standard region, trim or remove the
+ * non-standard region.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL) {
+ if (offset <= fa->fa_start && end >= fa->fa_end) {
+ new = fa;
+ fp->f_advice = NULL;
+ } else if (offset <= fa->fa_start &&
+ end >= fa->fa_start)
+ fa->fa_start = end + 1;
+ else if (offset <= fa->fa_end && end >= fa->fa_end)
+ fa->fa_end = offset - 1;
+ else if (offset >= fa->fa_start && end <= fa->fa_end) {
+ /*
+ * If the "normal" region is a middle
+ * portion of the existing
+ * non-standard region, just remove
+ * the whole thing rather than picking
+ * one side or the other to
+ * preserve.
+ */
+ new = fa;
+ fp->f_advice = NULL;
+ }
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ error = VOP_ADVISE(vp, offset, end, advice);
+ break;
+ }
+out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ free(new, M_FADVISE);
+ return (error);
+}
+
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+
+ return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
+ uap->advice));
+}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
new file mode 100644
index 0000000..c53030a
--- /dev/null
+++ b/sys/kern/vfs_vnops.c
@@ -0,0 +1,2083 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2013 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/kdb.h>
+#include <sys/stat.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/filio.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/ttycom.h>
+#include <sys/conf.h>
+#include <sys/syslog.h>
+#include <sys/unistd.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+
+static fo_rdwr_t vn_read;
+static fo_rdwr_t vn_write;
+static fo_rdwr_t vn_io_fault;
+static fo_truncate_t vn_truncate;
+static fo_ioctl_t vn_ioctl;
+static fo_poll_t vn_poll;
+static fo_kqfilter_t vn_kqfilter;
+static fo_stat_t vn_statfile;
+static fo_close_t vn_closefile;
+
+struct fileops vnops = {
+ .fo_read = vn_io_fault,
+ .fo_write = vn_io_fault,
+ .fo_truncate = vn_truncate,
+ .fo_ioctl = vn_ioctl,
+ .fo_poll = vn_poll,
+ .fo_kqfilter = vn_kqfilter,
+ .fo_stat = vn_statfile,
+ .fo_close = vn_closefile,
+ .fo_chmod = vn_chmod,
+ .fo_chown = vn_chown,
+ .fo_sendfile = vn_sendfile,
+ .fo_seek = vn_seek,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
+int
+vn_open(ndp, flagp, cmode, fp)
+ struct nameidata *ndp;
+ int *flagp, cmode;
+ struct file *fp;
+{
+ struct thread *td = ndp->ni_cnd.cn_thread;
+
+ return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
+}
+
+/*
+ * Common code for vnode open operations via a name lookup.
+ * Lookup the vnode and invoke VOP_CREATE if needed.
+ * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
+ *
+ * Note that this does NOT free nameidata for the successful case,
+ * due to the NDINIT being done elsewhere.
+ */
+int
+vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
+ struct ucred *cred, struct file *fp)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct thread *td = ndp->ni_cnd.cn_thread;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ int fmode, error;
+
+restart:
+ fmode = *flagp;
+ if (fmode & O_CREAT) {
+ ndp->ni_cnd.cn_nameiop = CREATE;
+ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF;
+ if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
+ ndp->ni_cnd.cn_flags |= FOLLOW;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ bwillwrite();
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ if (ndp->ni_vp == NULL) {
+ VATTR_NULL(vap);
+ vap->va_type = VREG;
+ vap->va_mode = cmode;
+ if (fmode & O_EXCL)
+ vap->va_vaflags |= VA_EXCLUSIVE;
+ if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(ndp->ni_dvp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ error = mac_vnode_check_create(cred, ndp->ni_dvp,
+ &ndp->ni_cnd, vap);
+ if (error == 0)
+#endif
+ error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+ &ndp->ni_cnd, vap);
+ vput(ndp->ni_dvp);
+ vn_finished_write(mp);
+ if (error) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ fmode &= ~O_TRUNC;
+ vp = ndp->ni_vp;
+ } else {
+ if (ndp->ni_dvp == ndp->ni_vp)
+ vrele(ndp->ni_dvp);
+ else
+ vput(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ vp = ndp->ni_vp;
+ if (fmode & O_EXCL) {
+ error = EEXIST;
+ goto bad;
+ }
+ fmode &= ~O_CREAT;
+ }
+ } else {
+ ndp->ni_cnd.cn_nameiop = LOOKUP;
+ ndp->ni_cnd.cn_flags = ISOPEN |
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
+ if (!(fmode & FWRITE))
+ ndp->ni_cnd.cn_flags |= LOCKSHARED;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ vp = ndp->ni_vp;
+ }
+ error = vn_open_vnode(vp, fmode, cred, td, fp);
+ if (error)
+ goto bad;
+ *flagp = fmode;
+ return (0);
+bad:
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(vp);
+ *flagp = fmode;
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * Common code for vnode open operations once a vnode is located.
+ * Check permissions, and call the VOP_OPEN routine.
+ */
+int
+vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
+ struct thread *td, struct file *fp)
+{
+ struct mount *mp;
+ accmode_t accmode;
+ struct flock lf;
+ int error, have_flock, lock_flags, type;
+
+ if (vp->v_type == VLNK)
+ return (EMLINK);
+ if (vp->v_type == VSOCK)
+ return (EOPNOTSUPP);
+ if (vp->v_type != VDIR && fmode & O_DIRECTORY)
+ return (ENOTDIR);
+ accmode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ accmode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ accmode |= VREAD;
+ if (fmode & FEXEC)
+ accmode |= VEXEC;
+ if ((fmode & O_APPEND) && (fmode & FWRITE))
+ accmode |= VAPPEND;
+#ifdef MAC
+ error = mac_vnode_check_open(cred, vp, accmode);
+ if (error)
+ return (error);
+#endif
+ if ((fmode & O_CREAT) == 0) {
+ if (accmode & VWRITE) {
+ error = vn_writechk(vp);
+ if (error)
+ return (error);
+ }
+ if (accmode) {
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ if (error)
+ return (error);
+ }
+ }
+ if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
+ return (error);
+
+ if (fmode & (O_EXLOCK | O_SHLOCK)) {
+ KASSERT(fp != NULL, ("open with flock requires fp"));
+ lock_flags = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (fmode & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((fmode & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
+ have_flock = (error == 0);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ if (error == 0 && vp->v_iflag & VI_DOOMED)
+ error = ENOENT;
+ /*
+ * Another thread might have used this vnode as an
+ * executable while the vnode lock was dropped.
+ * Ensure the vnode is still able to be opened for
+ * writing after the lock has been obtained.
+ */
+ if (error == 0 && accmode & VWRITE)
+ error = vn_writechk(vp);
+ if (error) {
+ VOP_UNLOCK(vp, 0);
+ if (have_flock) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
+ F_FLOCK);
+ }
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ (void)VOP_CLOSE(vp, fmode, cred, td);
+ vn_finished_write(mp);
+ return (error);
+ }
+ fp->f_flag |= FHASLOCK;
+ }
+ if (fmode & FWRITE) {
+ VOP_ADD_WRITECOUNT(vp, 1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
+ return (0);
+}
+
+/*
+ * Check for write permissions on the specified vnode.
+ * Prototype text segments cannot be written.
+ */
+int
+vn_writechk(vp)
+ register struct vnode *vp;
+{
+
+ ASSERT_VOP_LOCKED(vp, "vn_writechk");
+ /*
+ * If there's shared text associated with
+ * the vnode, try to free it up once. If
+ * we fail, we can't allow writing.
+ */
+ if (VOP_IS_TEXT(vp))
+ return (ETXTBSY);
+
+ return (0);
+}
+
+/*
+ * Vnode close call
+ */
+int
+vn_close(vp, flags, file_cred, td)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *file_cred;
+ struct thread *td;
+{
+ struct mount *mp;
+ int error, lock_flags;
+
+ if (!(flags & FWRITE) && vp->v_mount != NULL &&
+ vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ if (flags & FWRITE) {
+ VNASSERT(vp->v_writecount > 0, vp,
+ ("vn_close: negative writecount"));
+ VOP_ADD_WRITECOUNT(vp, -1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ error = VOP_CLOSE(vp, flags, file_cred, td);
+ vput(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Heuristic to detect sequential operation.
+ */
+static int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+
+ if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
+ return (fp->f_seqcount << IO_SEQSHIFT);
+
+ /*
+ * Offset 0 is handled specially. open() sets f_seqcount to 1 so
+ * that the first I/O is normally considered to be slightly
+ * sequential. Seeking to offset 0 doesn't change sequentiality
+ * unless previous seeks have reduced f_seqcount to 0, in which
+ * case offset 0 is not special.
+ */
+ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+ uio->uio_offset == fp->f_nextoff) {
+ /*
+ * f_seqcount is in units of fixed-size blocks so that it
+ * depends mainly on the amount of sequential I/O and not
+ * much on the number of sequential I/O's. The fixed size
+ * of 16384 is hard-coded here since it is (not quite) just
+ * a magic size that works well here. This size is more
+ * closely related to the best I/O size for real disks than
+ * to any block size used by software.
+ */
+ fp->f_seqcount += howmany(uio->uio_resid, 16384);
+ if (fp->f_seqcount > IO_SEQMAX)
+ fp->f_seqcount = IO_SEQMAX;
+ return (fp->f_seqcount << IO_SEQSHIFT);
+ }
+
+ /* Not sequential. Quickly draw-down sequentiality. */
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ return (0);
+}
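A worked example of this arithmetic, for illustration only and using nothing
beyond the constants visible above:

	/*
	 * Back-to-back 64 KB reads each add howmany(65536, 16384) == 4 to
	 * f_seqcount, so the counter saturates at IO_SEQMAX after roughly
	 * IO_SEQMAX / 4 such reads and the heuristic keeps returning the
	 * maximum read-ahead hint (IO_SEQMAX << IO_SEQSHIFT) in the ioflag
	 * bits.  A single read at an unrelated offset then drops f_seqcount
	 * back to 1 (or to 0 if it was already at 1), and the ramp-up
	 * starts over.
	 */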
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it.
+ */
+int
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+ enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+ struct ucred *file_cred, ssize_t *aresid, struct thread *td)
+{
+ struct uio auio;
+ struct iovec aiov;
+ struct mount *mp;
+ struct ucred *cred;
+ void *rl_cookie;
+ int error, lock_flags;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_td = td;
+ error = 0;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if (rw == UIO_READ) {
+ rl_cookie = vn_rangelock_rlock(vp, offset,
+ offset + len);
+ } else {
+ rl_cookie = vn_rangelock_wlock(vp, offset,
+ offset + len);
+ }
+ mp = NULL;
+ if (rw == UIO_WRITE) {
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+ != 0)
+ goto out;
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+ } else
+ lock_flags = LK_SHARED;
+ vn_lock(vp, lock_flags | LK_RETRY);
+ } else
+ rl_cookie = NULL;
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+#ifdef MAC
+ if ((ioflg & IO_NOMACCHECK) == 0) {
+ if (rw == UIO_READ)
+ error = mac_vnode_check_read(active_cred, file_cred,
+ vp);
+ else
+ error = mac_vnode_check_write(active_cred, file_cred,
+ vp);
+ }
+#endif
+ if (error == 0) {
+ if (file_cred != NULL)
+ cred = file_cred;
+ else
+ cred = active_cred;
+ if (rw == UIO_READ)
+ error = VOP_READ(vp, &auio, ioflg, cred);
+ else
+ error = VOP_WRITE(vp, &auio, ioflg, cred);
+ }
+ if (aresid)
+ *aresid = auio.uio_resid;
+ else
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ VOP_UNLOCK(vp, 0);
+ if (mp != NULL)
+ vn_finished_write(mp);
+ }
+ out:
+ if (rl_cookie != NULL)
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it. The I/O
+ * request is split up into smaller chunks and we try to avoid saturating
+ * the buffer cache while potentially holding a vnode locked, so we
+ * check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
+ * to give other processes a chance to lock the vnode (either other processes
+ * core'ing the same binary, or unrelated processes scanning the directory).
+ */
+int
+vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
+ file_cred, aresid, td)
+ enum uio_rw rw;
+ struct vnode *vp;
+ void *base;
+ size_t len;
+ off_t offset;
+ enum uio_seg segflg;
+ int ioflg;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ size_t *aresid;
+ struct thread *td;
+{
+ int error = 0;
+ ssize_t iaresid;
+
+ do {
+ int chunk;
+
+ /*
+ * Force `offset' to a multiple of MAXBSIZE except possibly
+ * for the first chunk, so that filesystems only need to
+ * write full blocks except possibly for the first and last
+ * chunks.
+ */
+ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
+
+ if (chunk > len)
+ chunk = len;
+ if (rw != UIO_READ && vp->v_type == VREG)
+ bwillwrite();
+ iaresid = 0;
+ error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+ ioflg, active_cred, file_cred, &iaresid, td);
+ len -= chunk; /* aresid calc already includes length */
+ if (error)
+ break;
+ offset += chunk;
+ base = (char *)base + chunk;
+ kern_yield(PRI_USER);
+ } while (len);
+ if (aresid)
+ *aresid = len + iaresid;
+ return (error);
+}
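To make the chunking arithmetic concrete, a worked example assuming the
customary MAXBSIZE of 65536 bytes:

	/*
	 * offset = 1000, len = 200000:
	 *   chunk 1: 65536 - 1000 % 65536 = 64536 bytes, ending at offset 65536
	 *   chunk 2: 65536 bytes (full, aligned block), ending at 131072
	 *   chunk 3: 65536 bytes, ending at 196608
	 *   chunk 4: the remaining 4392 bytes, ending at 201000
	 * Only the first and last chunks are partial blocks; everything in
	 * between is transferred as full, MAXBSIZE-aligned blocks.
	 */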
+
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+ struct mtx *mtxp;
+ off_t res;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ /*
+ * Caller only wants the current f_offset value. Assume that
+ * reads of long and shorter integer types are atomic.
+ */
+ if ((flags & FOF_NOLOCK) != 0)
+ return (fp->f_offset);
+#endif
+
+ /*
+ * According to McKusick the vn lock was protecting f_offset here.
+ * It is now protected by the FOFFSET_LOCKED flag.
+ */
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOLOCK) == 0) {
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vofflock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ }
+ res = fp->f_offset;
+ mtx_unlock(mtxp);
+ return (res);
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
+ struct mtx *mtxp;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ if ((flags & FOF_NOLOCK) != 0) {
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ return;
+ }
+#endif
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ if ((flags & FOF_NOLOCK) == 0) {
+ KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
+ ("Lost FOFFSET_LOCKED"));
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ }
+ mtx_unlock(mtxp);
+}
+
+void
+foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ uio->uio_offset = foffset_lock(fp, flags);
+}
+
+void
+foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ foffset_unlock(fp, uio->uio_offset, flags);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+ struct mtx *mtxp;
+ int ret;
+
+ ret = POSIX_FADV_NORMAL;
+ if (fp->f_advice == NULL)
+ return (ret);
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ ret = fp->f_advice->fa_advice;
+ mtx_unlock(mtxp);
+ return (ret);
+}
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct mtx *mtxp;
+ int error, ioflag;
+ int advice;
+ off_t offset, start, end;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ ioflag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ advice = get_advice(fp, uio);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* Disable read-ahead for random I/O. */
+ break;
+ }
+ offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READ(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE read(2). To optimize the common
+ * case of using POSIX_FADV_NOREUSE with sequential
+ * access, track the previous implicit DONTNEED
+ * request and grow this request to include the
+ * current read(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously read regions of the
+ * file. This allows filesystem blocks that are
+ * accessed by multiple calls to read(2) to be flushed
+ * once the last read(2) finishes.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
+ return (error);
+}
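The growth of the implicit DONTNEED range described in the comment above can
be traced with a small example (offsets chosen purely for illustration):

	/*
	 * Three sequential 64 KB reads on a descriptor marked
	 * POSIX_FADV_NOREUSE:
	 *   read 1 covers [0, 65535]:       fa_prevstart/fa_prevend become
	 *                                   0/65535; DONTNEED is issued for
	 *                                   [0, 65535]
	 *   read 2 covers [65536, 131071]:  fa_prevend + 1 == 65536, so start
	 *                                   is pulled back to 0; DONTNEED now
	 *                                   covers [0, 131071]
	 *   read 3 covers [131072, 196607]: the range keeps growing, so the
	 *                                   whole previously read region is
	 *                                   flushed once its buffers are clean
	 */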
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct mtx *mtxp;
+ int error, ioflag, lock_flags;
+ int advice;
+ off_t offset, start, end;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ ioflag = IO_UNIT;
+ if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+ ioflag |= IO_APPEND;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ if ((fp->f_flag & O_FSYNC) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+ ioflag |= IO_SYNC;
+ mp = NULL;
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto unlock;
+
+ advice = get_advice(fp, uio);
+
+ if (MNT_SHARED_WRITES(mp) ||
+ (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+
+ vn_lock(vp, lock_flags | LK_RETRY);
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* XXX: Is this correct? */
+ break;
+ }
+ offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VCHR)
+ vn_finished_write(mp);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE write(2). To optimize the
+ * common case of using POSIX_FADV_NOREUSE with
+ * sequential access, track the previous implicit
+ * DONTNEED request and grow this request to include
+ * the current write(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously written regions of the
+ * file.
+ *
+ * Note that the blocks just written are almost
+ * certainly still dirty, so this only works when
+ * VOP_ADVISE() calls from subsequent writes push out
+ * the data written by this write(2) once the backing
+ * buffers are clean. However, as compared to forcing
+ * IO_DIRECT, this gives much saner behavior. Write
+ * clustering is still allowed, and clean pages are
+ * merely moved to the cache page queue rather than
+ * outright thrown away. This means a subsequent
+ * read(2) can still avoid hitting the disk if the
+ * pages have not been reclaimed.
+ *
+ * This does make POSIX_FADV_NOREUSE largely useless
+ * with non-sequential access. However, sequential
+ * access is the more common use case and the flag is
+ * merely advisory.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
+
+unlock:
+ return (error);
+}
+
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+ &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static u_long vn_io_faults_cnt;
+SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+ &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that thread A reads from vnode vp1 into a userspace
+ * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
+ * not currently resident, then the system ends up with the call chain
+ * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ() or VOP_WRITE().
+ * Instead, it first tries to do the whole range i/o with pagefaults
+ * disabled. If all pages in the i/o buffer are resident and mapped,
+ * VOP will succeed (ignoring the genuine filesystem errors).
+ * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
+ * i/o in chunks, with all pages in the chunk prefaulted and held
+ * using vm_fault_quick_hold_pages().
+ *
+ * Filesystems using this deadlock avoidance scheme should use the
+ * array of the held pages from uio, saved in the curthread->td_ma,
+ * instead of doing uiomove(). A helper function
+ * vn_io_fault_uiomove() converts uiomove request into
+ * uiomove_fromphys() over td_ma array.
+ *
+ * Since vnode locks do not cover the whole i/o anymore, rangelocks
+ * make the current i/o request atomic with respect to other i/os and
+ * truncations.
+ */
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ vm_page_t ma[io_hold_cnt + 2];
+ struct uio *uio_clone, short_uio;
+ struct iovec short_iovec[1];
+ fo_rdwr_t *doio;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct mount *mp;
+ vm_page_t *prev_td_ma;
+ int cnt, error, save, saveheld, prev_td_ma_cnt;
+ vm_offset_t addr, end;
+ vm_prot_t prot;
+ size_t len, resid;
+ ssize_t adv;
+
+ if (uio->uio_rw == UIO_READ)
+ doio = vn_read;
+ else
+ doio = vn_write;
+ vp = fp->f_vnode;
+ foffset_lock_uio(fp, uio, flags);
+
+ if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
+ ((mp = vp->v_mount) != NULL &&
+ (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
+ !vn_io_fault_enable) {
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ goto out_last;
+ }
+
+ /*
+ * UFS follows the IO_UNIT directive and rolls back both
+ * uio_offset and uio_resid if an error is encountered during the
+ * operation. But, since the iovec may already be advanced,
+ * uio is still in an inconsistent state.
+ *
+ * Cache a copy of the original uio, which is advanced to the redo
+ * point using UIO_NOCOPY below.
+ */
+ uio_clone = cloneuio(uio);
+ resid = uio->uio_resid;
+
+ short_uio.uio_segflg = UIO_USERSPACE;
+ short_uio.uio_rw = uio->uio_rw;
+ short_uio.uio_td = uio->uio_td;
+
+ if (uio->uio_rw == UIO_READ) {
+ prot = VM_PROT_WRITE;
+ rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ } else {
+ prot = VM_PROT_READ;
+ if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
+ /* For appenders, punt and lock the whole range. */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ else
+ rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ }
+
+ save = vm_fault_disable_pagefaults();
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ if (error != EFAULT)
+ goto out;
+
+ atomic_add_long(&vn_io_faults_cnt, 1);
+ uio_clone->uio_segflg = UIO_NOCOPY;
+ uiomove(NULL, resid - uio->uio_resid, uio_clone);
+ uio_clone->uio_segflg = uio->uio_segflg;
+
+ saveheld = curthread_pflags_set(TDP_UIOHELD);
+ prev_td_ma = td->td_ma;
+ prev_td_ma_cnt = td->td_ma_cnt;
+
+ while (uio_clone->uio_resid != 0) {
+ len = uio_clone->uio_iov->iov_len;
+ if (len == 0) {
+ KASSERT(uio_clone->uio_iovcnt >= 1,
+ ("iovcnt underflow"));
+ uio_clone->uio_iov++;
+ uio_clone->uio_iovcnt--;
+ continue;
+ }
+
+ addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
+ end = round_page(addr + len);
+ cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+ /*
+ * A perfectly misaligned address and length could cause
+ * both the start and the end of the chunk to use a partial
+ * page. The +2 accounts for such a situation.
+ */
+ if (cnt > io_hold_cnt + 2) {
+ len = io_hold_cnt * PAGE_SIZE;
+ KASSERT(howmany(round_page(addr + len) -
+ trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
+ ("cnt overflow"));
+ }
+ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+ addr, len, prot, ma, io_hold_cnt + 2);
+ if (cnt == -1) {
+ error = EFAULT;
+ break;
+ }
+ short_uio.uio_iov = &short_iovec[0];
+ short_iovec[0].iov_base = (void *)addr;
+ short_uio.uio_iovcnt = 1;
+ short_uio.uio_resid = short_iovec[0].iov_len = len;
+ short_uio.uio_offset = uio_clone->uio_offset;
+ td->td_ma = ma;
+ td->td_ma_cnt = cnt;
+
+ error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+ td);
+ vm_page_unhold_pages(ma, cnt);
+ adv = len - short_uio.uio_resid;
+
+ uio_clone->uio_iov->iov_base =
+ (char *)uio_clone->uio_iov->iov_base + adv;
+ uio_clone->uio_iov->iov_len -= adv;
+ uio_clone->uio_resid -= adv;
+ uio_clone->uio_offset += adv;
+
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+
+ if (error != 0 || adv == 0)
+ break;
+ }
+ td->td_ma = prev_td_ma;
+ td->td_ma_cnt = prev_td_ma_cnt;
+ curthread_pflags_restore(saveheld);
+out:
+ vm_fault_enable_pagefaults(save);
+ vn_rangelock_unlock(vp, rl_cookie);
+ free(uio_clone, M_IOV);
+out_last:
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+/*
+ * Helper function to perform the requested uiomove operation using
+ * the held pages for io->uio_iov[0].iov_base buffer instead of
+ * copyin/copyout. Access to the pages with uiomove_fromphys()
+ * instead of iov_base prevents page faults that could occur due to
+ * pmap_collect() invalidating the mapping created by
+ * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
+ * object cleanup revoking the write access from page mappings.
+ *
+ * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
+ * instead of plain uiomove().
+ */
+int
+vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
+{
+ struct uio transp_uio;
+ struct iovec transp_iov[1];
+ struct thread *td;
+ size_t adv;
+ int error, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove(data, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ transp_iov[0].iov_base = data;
+ transp_uio.uio_iov = &transp_iov[0];
+ transp_uio.uio_iovcnt = 1;
+ if (xfersize > uio->uio_resid)
+ xfersize = uio->uio_resid;
+ transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
+ transp_uio.uio_offset = 0;
+ transp_uio.uio_segflg = UIO_SYSSPACE;
+ /*
+ * Since transp_iov points to data, and td_ma page array
+ * corresponds to original uio->uio_iov, we need to invert the
+ * direction of the i/o operation as passed to
+ * uiomove_fromphys().
+ */
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ transp_uio.uio_rw = UIO_READ;
+ break;
+ case UIO_READ:
+ transp_uio.uio_rw = UIO_WRITE;
+ break;
+ }
+ transp_uio.uio_td = uio->uio_td;
+ error = uiomove_fromphys(td->td_ma,
+ ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
+ xfersize, &transp_uio);
+ adv = xfersize - transp_uio.uio_resid;
+ pgadv =
+ (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
+ (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
+ uio->uio_iov->iov_len -= adv;
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+ return (error);
+}
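A filesystem that sets MNTK_NO_IOPF funnels its copy-out through
vn_io_fault_uiomove() instead of uiomove(). The fragment below is a sketch
only; "examplefs" and its buffer handling are hypothetical, and just the
vn_io_fault_uiomove() call reflects the interface above.

	static int
	examplefs_read_buf(struct buf *bp, long off_in_buf, struct uio *uio)
	{
		int xfersize;

		/* Copy at most what remains in this buffer and in the request. */
		xfersize = bp->b_bcount - off_in_buf;
		if (xfersize > uio->uio_resid)
			xfersize = uio->uio_resid;
		/*
		 * With vn_io_fault() driving the request this resolves to
		 * uiomove_fromphys() over the pre-held pages in td_ma;
		 * otherwise it falls back to a plain uiomove().
		 */
		return (vn_io_fault_uiomove((char *)bp->b_data + off_in_buf,
		    xfersize, uio));
	}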
+
+int
+vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
+ struct uio *uio)
+{
+ struct thread *td;
+ vm_offset_t iov_base;
+ int cnt, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove_fromphys(ma, offset, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
+ iov_base = (vm_offset_t)uio->uio_iov->iov_base;
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
+ offset, cnt);
+ break;
+ case UIO_READ:
+ pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
+ cnt);
+ break;
+ }
+ pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)(iov_base + cnt);
+ uio->uio_iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ return (0);
+}
+
+
+/*
+ * File table truncate routine.
+ */
+static int
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vattr vattr;
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ int error;
+
+ vp = fp->f_vnode;
+
+ /*
+ * Lock the whole range for truncation. Otherwise split i/o
+ * might happen partly before and partly after the truncation.
+ */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ goto out1;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error)
+ goto out;
+#endif
+ error = vn_writechk(vp);
+ if (error == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred);
+ }
+out:
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out1:
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
+
+/*
+ * File table vnode stat routine.
+ */
+static int
+vn_statfile(fp, sb, active_cred, td)
+ struct file *fp;
+ struct stat *sb;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vnode *vp = fp->f_vnode;
+ int error;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
+ VOP_UNLOCK(vp, 0);
+
+ return (error);
+}
+
+/*
+ * Stat a vnode; implementation for the stat syscall
+ */
+int
+vn_stat(vp, sb, active_cred, file_cred, td)
+ struct vnode *vp;
+ register struct stat *sb;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ struct thread *td;
+{
+ struct vattr vattr;
+ register struct vattr *vap;
+ int error;
+ u_short mode;
+
+#ifdef MAC
+ error = mac_vnode_check_stat(active_cred, file_cred, vp);
+ if (error)
+ return (error);
+#endif
+
+ vap = &vattr;
+
+ /*
+ * Initialize defaults for new and unusual fields, so that file
+ * systems which don't support these fields don't need to know
+ * about them.
+ */
+ vap->va_birthtime.tv_sec = -1;
+ vap->va_birthtime.tv_nsec = 0;
+ vap->va_fsid = VNOVAL;
+ vap->va_rdev = NODEV;
+
+ error = VOP_GETATTR(vp, vap, active_cred);
+ if (error)
+ return (error);
+
+ /*
+ * Zero the spare stat fields
+ */
+ bzero(sb, sizeof *sb);
+
+ /*
+ * Copy from vattr table
+ */
+ if (vap->va_fsid != VNOVAL)
+ sb->st_dev = vap->va_fsid;
+ else
+ sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
+ sb->st_ino = vap->va_fileid;
+ mode = vap->va_mode;
+ switch (vap->va_type) {
+ case VREG:
+ mode |= S_IFREG;
+ break;
+ case VDIR:
+ mode |= S_IFDIR;
+ break;
+ case VBLK:
+ mode |= S_IFBLK;
+ break;
+ case VCHR:
+ mode |= S_IFCHR;
+ break;
+ case VLNK:
+ mode |= S_IFLNK;
+ break;
+ case VSOCK:
+ mode |= S_IFSOCK;
+ break;
+ case VFIFO:
+ mode |= S_IFIFO;
+ break;
+ default:
+ return (EBADF);
+ };
+ sb->st_mode = mode;
+ sb->st_nlink = vap->va_nlink;
+ sb->st_uid = vap->va_uid;
+ sb->st_gid = vap->va_gid;
+ sb->st_rdev = vap->va_rdev;
+ if (vap->va_size > OFF_MAX)
+ return (EOVERFLOW);
+ sb->st_size = vap->va_size;
+ sb->st_atim = vap->va_atime;
+ sb->st_mtim = vap->va_mtime;
+ sb->st_ctim = vap->va_ctime;
+ sb->st_birthtim = vap->va_birthtime;
+
+ /*
+ * According to www.opengroup.org, the meaning of st_blksize is
+ * "a filesystem-specific preferred I/O block size for this
+ * object. In some filesystem types, this may vary from file
+ * to file"
+ * Use the minimum/default of PAGE_SIZE (e.g. for VCHR).
+ */
+
+ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
+
+ sb->st_flags = vap->va_flags;
+ if (priv_check(td, PRIV_VFS_GENERATION))
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+ sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+ return (0);
+}
+
+/*
+ * File table vnode ioctl routine.
+ */
+static int
+vn_ioctl(fp, com, data, active_cred, td)
+ struct file *fp;
+ u_long com;
+ void *data;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ int error;
+
+ vp = fp->f_vnode;
+ switch (vp->v_type) {
+ case VDIR:
+ case VREG:
+ switch (com) {
+ case FIONREAD:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, active_cred);
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *(int *)data = vattr.va_size - fp->f_offset;
+ return (error);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (VOP_IOCTL(vp, com, data, fp->f_flag,
+ active_cred, td));
+ }
+ default:
+ return (ENOTTY);
+ }
+}
+
+/*
+ * File table vnode poll routine.
+ */
+static int
+vn_poll(fp, events, active_cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vnode *vp;
+ int error;
+
+ vp = fp->f_vnode;
+#ifdef MAC
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
+ VOP_UNLOCK(vp, 0);
+ if (!error)
+#endif
+
+ error = VOP_POLL(vp, events, fp->f_cred, td);
+ return (error);
+}
+
+/*
+ * Acquire the requested lock and then check for validity. LK_RETRY
+ * permits vn_lock to return doomed vnodes.
+ */
+int
+_vn_lock(struct vnode *vp, int flags, char *file, int line)
+{
+ int error;
+
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vn_lock called with no locktype."));
+ do {
+#ifdef DEBUG_VFS_LOCKS
+ KASSERT(vp->v_holdcnt != 0,
+ ("vn_lock %p: zero hold count", vp));
+#endif
+ error = VOP_LOCK1(vp, flags, file, line);
+ flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
+ KASSERT((flags & LK_RETRY) == 0 || error == 0,
+ ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
+ flags, error));
+ /*
+ * Callers specify LK_RETRY if they wish to get dead vnodes.
+ * If RETRY is not set, we return ENOENT instead.
+ */
+ if (error == 0 && vp->v_iflag & VI_DOOMED &&
+ (flags & LK_RETRY) == 0) {
+ VOP_UNLOCK(vp, 0);
+ error = ENOENT;
+ break;
+ }
+ } while (flags & LK_RETRY && error != 0);
+ return (error);
+}
+
+/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+
+ vp = fp->f_vnode;
+ fp->f_ops = &badfileops;
+
+ if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
+ vref(vp);
+
+ error = vn_close(vp, fp->f_flag, fp->f_cred, td);
+
+ if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
+ vrele(vp);
+ }
+ return (error);
+}
+
+/*
+ * Preparing to start a filesystem write operation. If the operation is
+ * permitted, then we bump the count of operations in progress and
+ * proceed. If a suspend request is in progress, we wait until the
+ * suspension is over, and then proceed.
+ */
+static int
+vn_start_write_locked(struct mount *mp, int flags)
+{
+ int error;
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+ error = 0;
+
+ /*
+ * Check on status of suspension.
+ */
+ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
+ mp->mnt_susp_owner != curthread) {
+ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ if (flags & V_NOWAIT) {
+ error = EWOULDBLOCK;
+ goto unlock;
+ }
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+ (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
+ if (error)
+ goto unlock;
+ }
+ }
+ if (flags & V_XSLEEP)
+ goto unlock;
+ mp->mnt_writeopcount++;
+unlock:
+ if (error != 0 || (flags & V_XSLEEP) != 0)
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (error);
+}
+
+int
+vn_start_write(vp, mpp, flags)
+ struct vnode *vp;
+ struct mount **mpp;
+ int flags;
+{
+ struct mount *mp;
+ int error;
+
+ error = 0;
+ /*
+ * If a vnode is provided, get and return the mount point to
+ * which it will write.
+ */
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * If no vnode was provided, the caller passed in the mount
+ * point directly, so take a reference on it ourselves in
+ * order to emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
+
+ return (vn_start_write_locked(mp, flags));
+}
+
+/*
+ * Secondary suspension. Used by operations such as vop_inactive
+ * routines that are needed by the higher level functions. These
+ * are allowed to proceed until all the higher level functions have
+ * completed (indicated by mnt_writeopcount dropping to zero). At that
+ * time, these operations are halted until the suspension is over.
+ */
+int
+vn_start_secondary_write(vp, mpp, flags)
+ struct vnode *vp;
+ struct mount **mpp;
+ int flags;
+{
+ struct mount *mp;
+ int error;
+
+ retry:
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ /*
+ * If we are not suspended or have not yet reached suspended
+ * mode, then let the operation proceed.
+ */
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * If no vnode was provided, the caller passed in the mount
+ * point directly, so take a reference on it ourselves in
+ * order to emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
+ if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
+ mp->mnt_secondary_writes++;
+ mp->mnt_secondary_accwrites++;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+ if (flags & V_NOWAIT) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Wait for the suspension to finish.
+ */
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+ (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+ vfs_rel(mp);
+ if (error == 0)
+ goto retry;
+ return (error);
+}
+
+/*
+ * Filesystem write operation has completed. If we are suspending and this
+ * operation is the last one, notify the suspender that the suspension is
+ * now in effect.
+ */
+void
+vn_finished_write(mp)
+ struct mount *mp;
+{
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_writeopcount--;
+ if (mp->mnt_writeopcount < 0)
+ panic("vn_finished_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_writeopcount <= 0)
+ wakeup(&mp->mnt_writeopcount);
+ MNT_IUNLOCK(mp);
+}
+
+
+/*
+ * Filesystem secondary write operation has completed. If we are
+ * suspending and this operation is the last one, notify the suspender
+ * that the suspension is now in effect.
+ */
+void
+vn_finished_secondary_write(mp)
+ struct mount *mp;
+{
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_secondary_writes--;
+ if (mp->mnt_secondary_writes < 0)
+ panic("vn_finished_secondary_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_secondary_writes <= 0)
+ wakeup(&mp->mnt_secondary_writes);
+ MNT_IUNLOCK(mp);
+}
+
+
+
+/*
+ * Request a filesystem to suspend write operations.
+ */
+int
+vfs_write_suspend(struct mount *mp, int flags)
+{
+ int error;
+
+ MNT_ILOCK(mp);
+ if (mp->mnt_susp_owner == curthread) {
+ MNT_IUNLOCK(mp);
+ return (EALREADY);
+ }
+ while (mp->mnt_kern_flag & MNTK_SUSPEND)
+ msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
+
+ /*
+ * Unmount holds a write reference on the mount point. If we
+ * own a busy reference and drain for writers, we deadlock with
+ * the reference draining in the unmount path. Callers of
+ * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
+ * vfs_busy() reference is owned and caller is not in the
+ * unmount context.
+ */
+ if ((flags & VS_SKIP_UNMOUNT) != 0 &&
+ (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+ MNT_IUNLOCK(mp);
+ return (EBUSY);
+ }
+
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = curthread;
+ if (mp->mnt_writeopcount > 0)
+ (void) msleep(&mp->mnt_writeopcount,
+ MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
+ else
+ MNT_IUNLOCK(mp);
+ if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
+ vfs_write_resume(mp, 0);
+ return (error);
+}
+
+/*
+ * Request a filesystem to resume write operations.
+ */
+void
+vfs_write_resume(struct mount *mp, int flags)
+{
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
+ mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
+ MNTK_SUSPENDED);
+ mp->mnt_susp_owner = NULL;
+ wakeup(&mp->mnt_writeopcount);
+ wakeup(&mp->mnt_flag);
+ curthread->td_pflags &= ~TDP_IGNSUSP;
+ if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ mp->mnt_writeopcount++;
+ }
+ MNT_IUNLOCK(mp);
+ if ((flags & VR_NO_SUSPCLR) == 0)
+ VFS_SUSP_CLEAN(mp);
+ } else if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ vn_start_write_locked(mp, 0);
+ } else {
+ MNT_IUNLOCK(mp);
+ }
+}
+
+/*
+ * Implement kqueues for files by translating it to vnode operation.
+ */
+static int
+vn_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (VOP_KQFILTER(fp->f_vnode, kn));
+}
+
+/*
+ * Simplified in-kernel wrapper calls for extended attribute access.
+ * Both calls pass in a NULL credential, authorizing as "kernel" access.
+ * Set IO_NODELOCKED in ioflg if the vnode is already locked.
+ */
+int
+vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int *buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ int error;
+
+ iov.iov_len = *buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = *buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute retrieval as kernel */
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
+ td);
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ VOP_UNLOCK(vp, 0);
+
+ if (error == 0) {
+ *buflen = *buflen - auio.uio_resid;
+ }
+
+ return (error);
+}
+
+/*
+ * XXX failure mode if partially written?
+ */
+int
+vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct mount *mp;
+ int error;
+
+ iov.iov_len = buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute setting as kernel */
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+int
+vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute removal as kernel */
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+int
+vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
+{
+ struct mount *mp;
+ int ltype, error;
+
+ mp = vp->v_mount;
+ ltype = VOP_ISLOCKED(vp);
+ KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
+ ("vn_vget_ino: vp not locked"));
+ error = vfs_busy(mp, MBF_NOWAIT);
+ if (error != 0) {
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ error = vfs_busy(mp, 0);
+ vn_lock(vp, ltype | LK_RETRY);
+ vfs_rel(mp);
+ if (error != 0)
+ return (ENOENT);
+ if (vp->v_iflag & VI_DOOMED) {
+ vfs_unbusy(mp);
+ return (ENOENT);
+ }
+ }
+ VOP_UNLOCK(vp, 0);
+ error = VFS_VGET(mp, ino, lkflags, rvp);
+ vfs_unbusy(mp);
+ vn_lock(vp, ltype | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ if (error == 0)
+ vput(*rvp);
+ error = ENOENT;
+ }
+ return (error);
+}
+
+int
+vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
+ const struct thread *td)
+{
+
+ if (vp->v_type != VREG || td == NULL)
+ return (0);
+ PROC_LOCK(td->td_proc);
+ if ((uoff_t)uio->uio_offset + uio->uio_resid >
+ lim_cur(td->td_proc, RLIMIT_FSIZE)) {
+ kern_psignal(td->td_proc, SIGXFSZ);
+ PROC_UNLOCK(td->td_proc);
+ return (EFBIG);
+ }
+ PROC_UNLOCK(td->td_proc);
+ return (0);
+}
+
+int
+vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfmode(td, active_cred, vp, mode));
+}
+
+int
+vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfown(td, active_cred, vp, uid, gid));
+}
+
+void
+vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
+{
+ vm_object_t object;
+
+ if ((object = vp->v_object) == NULL)
+ return;
+ VM_OBJECT_WLOCK(object);
+ vm_object_page_remove(object, start, end, 0);
+ VM_OBJECT_WUNLOCK(object);
+}
+
+int
+vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
+{
+ struct vattr va;
+ daddr_t bn, bnp;
+ uint64_t bsize;
+ off_t noff;
+ int error;
+
+ KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
+ ("Wrong command %lu", cmd));
+
+ if (vn_lock(vp, LK_SHARED) != 0)
+ return (EBADF);
+ if (vp->v_type != VREG) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ goto unlock;
+ noff = *off;
+ if (noff >= va.va_size) {
+ error = ENXIO;
+ goto unlock;
+ }
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
+ error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
+ if (error == EOPNOTSUPP) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
+ (bnp != -1 && cmd == FIOSEEKDATA)) {
+ noff = bn * bsize;
+ if (noff < *off)
+ noff = *off;
+ goto unlock;
+ }
+ }
+ if (noff > va.va_size)
+ noff = va.va_size;
+ /* noff == va.va_size. There is an implicit hole at the end of file. */
+ if (cmd == FIOSEEKDATA)
+ error = ENXIO;
+unlock:
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *off = noff;
+ return (error);
+}
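vn_bmap_seekhole() is what backs the FIOSEEKHOLE/FIOSEEKDATA ioctls for
filesystems that map them to VOP_BMAP(); from userspace the same
functionality is reached through lseek(2) with SEEK_DATA/SEEK_HOLE. A short
sketch, with a hypothetical sparse file:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		off_t data, hole;
		int fd;

		fd = open("/tmp/sparse.img", O_RDONLY);	/* hypothetical file */
		if (fd == -1) {
			perror("open");
			return (1);
		}
		data = lseek(fd, 0, SEEK_DATA);
		if (data == -1) {
			perror("lseek(SEEK_DATA)");
		} else {
			hole = lseek(fd, data, SEEK_HOLE);
			if (hole == -1)
				perror("lseek(SEEK_HOLE)");
			else
				printf("first data region: [%jd, %jd)\n",
				    (intmax_t)data, (intmax_t)hole);
		}
		close(fd);
		return (0);
	}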
+
+int
+vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+ struct ucred *cred;
+ struct vnode *vp;
+ struct vattr vattr;
+ off_t foffset, size;
+ int error, noneg;
+
+ cred = td->td_ucred;
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+ noneg = (vp->v_type != VCHR);
+ error = 0;
+ switch (whence) {
+ case L_INCR:
+ if (noneg &&
+ (foffset < 0 ||
+ (offset > 0 && foffset > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += foffset;
+ break;
+ case L_XTND:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ break;
+
+ /*
+ * If the file references a disk device, then fetch
+ * the media size and use that to determine the ending
+ * offset.
+ */
+ if (vattr.va_size == 0 && vp->v_type == VCHR &&
+ fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
+ vattr.va_size = size;
+ if (noneg &&
+ (vattr.va_size > OFF_MAX ||
+ (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += vattr.va_size;
+ break;
+ case L_SET:
+ break;
+ case SEEK_DATA:
+ error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
+ break;
+ case SEEK_HOLE:
+ error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error == 0 && noneg && offset < 0)
+ error = EINVAL;
+ if (error != 0)
+ goto drop;
+ VFS_KNOTE_UNLOCKED(vp, 0);
+ *(off_t *)(td->td_retval) = offset;
+drop:
+ foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+ return (error);
+}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
new file mode 100644
index 0000000..eabfb43
--- /dev/null
+++ b/sys/kern/vnode_if.src
@@ -0,0 +1,716 @@
+#-
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+# $FreeBSD$
+#
+
+#
+# Above each of the vop descriptors in lines starting with %%
+# is a specification of the locking protocol used by each vop call.
+# The first column is the name of the variable, the remaining three
+# columns are in, out and error respectively. The "in" column defines
+# the lock state on input, the "out" column defines the state on successful
+# return, and the "error" column defines the locking state on error exit.
+#
+# The locking value can take the following values:
+# L: locked; not converted to type of lock.
+# A: any lock type.
+# S: locked with shared lock.
+# E: locked with exclusive lock for this process.
+# O: locked with exclusive lock for other process.
+# U: unlocked.
+# -: not applicable. vnode does not yet (or no longer) exist.
+# =: the same on input and output, may be either L or U.
+# X: locked if not nil.
+#
+# The parameter named "vpp" is assumed to always be used with double
+# indirection (**vpp) and that name is hard-coded in vnode_if.awk !
+#
+# Lines starting with %! specify a pre or post-condition function
+# to call before/after the vop call.
+#
+# If other such parameters are introduced, they have to be added to
+# the AWK script at the head of the definition of "add_debug_code()".
+#
+
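As a worked reading of the notation above, the pair of lines that appears
later in this file

	%% create dvp E E E
	%% create vpp - L -

says that for VOP_CREATE() the directory vnode dvp must be exclusively locked
on entry and remains exclusively locked on both successful and error return,
while *vpp does not exist on entry and is returned locked (of unspecified
lock type) on success, with no locking obligation on error.
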
+vop_islocked {
+ IN struct vnode *vp;
+};
+
+%% lookup dvp L L L
+%% lookup vpp - L -
+
+# XXX - the lookup locking protocol defies simple description and depends
+# on the flags and operation fields in the (cnp) structure. Note
+# especially that *vpp may equal dvp and both may be locked.
+
+vop_lookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+%% cachedlookup dvp L L L
+%% cachedlookup vpp - L -
+
+# This must be an exact copy of lookup. See kern/vfs_cache.c for details.
+
+vop_cachedlookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+%% create dvp E E E
+%% create vpp - L -
+%! create post vop_create_post
+
+vop_create {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% whiteout dvp E E E
+
+vop_whiteout {
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int flags;
+};
+
+
+%% mknod dvp E E E
+%% mknod vpp - L -
+%! mknod post vop_mknod_post
+
+vop_mknod {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% open vp L L L
+
+vop_open {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+ IN struct file *fp;
+};
+
+
+%% close vp L L L
+
+vop_close {
+ IN struct vnode *vp;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% access vp L L L
+
+vop_access {
+ IN struct vnode *vp;
+ IN accmode_t accmode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% accessx vp L L L
+
+vop_accessx {
+ IN struct vnode *vp;
+ IN accmode_t accmode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% getattr vp L L L
+
+vop_getattr {
+ IN struct vnode *vp;
+ OUT struct vattr *vap;
+ IN struct ucred *cred;
+};
+
+
+%% setattr vp E E E
+%! setattr post vop_setattr_post
+
+vop_setattr {
+ IN struct vnode *vp;
+ IN struct vattr *vap;
+ IN struct ucred *cred;
+};
+
+%% markatime vp L L L
+
+vop_markatime {
+ IN struct vnode *vp;
+};
+
+%% read vp L L L
+
+vop_read {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+
+%% write vp L L L
+%! write pre VOP_WRITE_PRE
+%! write post VOP_WRITE_POST
+
+vop_write {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+
+%% ioctl vp U U U
+
+vop_ioctl {
+ IN struct vnode *vp;
+ IN u_long command;
+ IN void *data;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% poll vp U U U
+
+vop_poll {
+ IN struct vnode *vp;
+ IN int events;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% kqfilter vp U U U
+
+vop_kqfilter {
+ IN struct vnode *vp;
+ IN struct knote *kn;
+};
+
+
+%% revoke vp L L L
+
+vop_revoke {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+
+%% fsync vp L L L
+
+vop_fsync {
+ IN struct vnode *vp;
+ IN int waitfor;
+ IN struct thread *td;
+};
+
+
+%% remove dvp E E E
+%% remove vp E E E
+%! remove post vop_remove_post
+
+vop_remove {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%% link tdvp E E E
+%% link vp E E E
+%! link post vop_link_post
+
+vop_link {
+ IN struct vnode *tdvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%! rename pre vop_rename_pre
+%! rename post vop_rename_post
+
+vop_rename {
+ IN WILLRELE struct vnode *fdvp;
+ IN WILLRELE struct vnode *fvp;
+ IN struct componentname *fcnp;
+ IN WILLRELE struct vnode *tdvp;
+ IN WILLRELE struct vnode *tvp;
+ IN struct componentname *tcnp;
+};
+
+
+%% mkdir dvp E E E
+%% mkdir vpp - E -
+%! mkdir post vop_mkdir_post
+
+vop_mkdir {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% rmdir dvp E E E
+%% rmdir vp E E E
+%! rmdir post vop_rmdir_post
+
+vop_rmdir {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%% symlink dvp E E E
+%% symlink vpp - E -
+%! symlink post vop_symlink_post
+
+vop_symlink {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+ IN char *target;
+};
+
+
+%% readdir vp L L L
+
+vop_readdir {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ INOUT int *eofflag;
+ OUT int *ncookies;
+ INOUT u_long **cookies;
+};
+
+
+%% readlink vp L L L
+
+vop_readlink {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+};
+
+
+%% inactive vp E E E
+
+vop_inactive {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+
+%% reclaim vp E E E
+
+vop_reclaim {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+
+%! lock1 pre vop_lock_pre
+%! lock1 post vop_lock_post
+
+vop_lock1 {
+ IN struct vnode *vp;
+ IN int flags;
+ IN char *file;
+ IN int line;
+};
+
+
+%! unlock pre vop_unlock_pre
+%! unlock post vop_unlock_post
+
+vop_unlock {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+
+%% bmap vp L L L
+
+vop_bmap {
+ IN struct vnode *vp;
+ IN daddr_t bn;
+ OUT struct bufobj **bop;
+ IN daddr_t *bnp;
+ OUT int *runp;
+ OUT int *runb;
+};
+
+
+%% strategy vp L L L
+%! strategy pre vop_strategy_pre
+
+vop_strategy {
+ IN struct vnode *vp;
+ IN struct buf *bp;
+};
+
+
+%% getwritemount vp = = =
+
+vop_getwritemount {
+ IN struct vnode *vp;
+ OUT struct mount **mpp;
+};
+
+
+%% print vp - - -
+
+vop_print {
+ IN struct vnode *vp;
+};
+
+
+%% pathconf vp L L L
+
+vop_pathconf {
+ IN struct vnode *vp;
+ IN int name;
+ OUT register_t *retval;
+};
+
+
+%% advlock vp U U U
+
+vop_advlock {
+ IN struct vnode *vp;
+ IN void *id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+};
+
+
+%% advlockasync vp U U U
+
+vop_advlockasync {
+ IN struct vnode *vp;
+ IN void *id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+ IN struct task *task;
+ INOUT void **cookiep;
+};
+
+
+%% advlockpurge vp E E E
+
+vop_advlockpurge {
+ IN struct vnode *vp;
+};
+
+
+%% reallocblks vp E E E
+
+vop_reallocblks {
+ IN struct vnode *vp;
+ IN struct cluster_save *buflist;
+};
+
+
+%% getpages vp L L L
+
+vop_getpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int reqpage;
+ IN vm_ooffset_t offset;
+};
+
+
+%% putpages vp E E E
+
+vop_putpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int sync;
+ IN int *rtvals;
+ IN vm_ooffset_t offset;
+};
+
+
+%% getacl vp L L L
+
+vop_getacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ OUT struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setacl vp E E E
+
+vop_setacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% aclcheck vp = = =
+
+vop_aclcheck {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% closeextattr vp L L L
+
+vop_closeextattr {
+ IN struct vnode *vp;
+ IN int commit;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% getextattr vp L L L
+
+vop_getextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ OUT size_t *size;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% listextattr vp L L L
+
+vop_listextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ INOUT struct uio *uio;
+ OUT size_t *size;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% openextattr vp L L L
+
+vop_openextattr {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% deleteextattr vp E E E
+%! deleteextattr post vop_deleteextattr_post
+
+vop_deleteextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setextattr vp E E E
+%! setextattr post vop_setextattr_post
+
+vop_setextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setlabel vp E E E
+
+vop_setlabel {
+ IN struct vnode *vp;
+ IN struct label *label;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% vptofh vp = = =
+
+vop_vptofh {
+ IN struct vnode *vp;
+ IN struct fid *fhp;
+};
+
+
+%% vptocnp vp L L L
+%% vptocnp vpp - U -
+
+vop_vptocnp {
+ IN struct vnode *vp;
+ OUT struct vnode **vpp;
+ IN struct ucred *cred;
+ INOUT char *buf;
+ INOUT int *buflen;
+};
+
+
+%% allocate vp E E E
+
+vop_allocate {
+ IN struct vnode *vp;
+ INOUT off_t *offset;
+ INOUT off_t *len;
+};
+
+%% advise vp U U U
+
+vop_advise {
+ IN struct vnode *vp;
+ IN off_t start;
+ IN off_t end;
+ IN int advice;
+};
+
+%% unp_bind vp E E E
+
+vop_unp_bind {
+ IN struct vnode *vp;
+ IN struct socket *socket;
+};
+
+%% unp_connect vp L L L
+
+vop_unp_connect {
+ IN struct vnode *vp;
+ OUT struct socket **socket;
+};
+
+%% unp_detach vp = = =
+
+vop_unp_detach {
+ IN struct vnode *vp;
+};
+
+%% is_text vp L L L
+
+vop_is_text {
+ IN struct vnode *vp;
+};
+
+%% set_text vp E E E
+
+vop_set_text {
+ IN struct vnode *vp;
+};
+
+%% unset_text vp E E E
+
+vop_unset_text {
+ IN struct vnode *vp;
+};
+
+%% get_writecount vp L L L
+
+vop_get_writecount {
+ IN struct vnode *vp;
+ OUT int *writecount;
+};
+
+%% add_writecount vp E E E
+
+vop_add_writecount {
+ IN struct vnode *vp;
+ IN int inc;
+};
+
+# The VOPs below are spares at the end of the table to allow new VOPs to be
+# added in stable branches without breaking the KBI. New VOPs in HEAD should
+# be added above these spares. When merging a new VOP to a stable branch,
+# the new VOP should replace one of the spares.
+
+vop_spare1 {
+ IN struct vnode *vp;
+};
+
+vop_spare2 {
+ IN struct vnode *vp;
+};
+
+vop_spare3 {
+ IN struct vnode *vp;
+};
+
+vop_spare4 {
+ IN struct vnode *vp;
+};
+
+vop_spare5 {
+ IN struct vnode *vp;
+};